diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f6ab388e2a..ccdfb0f6fb 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-* @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli
 # Documentation files
-docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
-*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
-*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
-.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj
+library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
diff --git a/.gitignore b/.gitignore
index f4d5ff7abd..599ef99e35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,6 +55,8 @@ _static/
 _templates/
 _toc.yml
 _doxygen/
+docs/doxygen/html
+docs/doxygen/xml
 
 # JetBrains IDE
 .idea/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dec6334cf5..e0ec214c69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,42 @@
 
 Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
 
+## Composable Kernel 1.1.0 for ROCm 6.5.0
+
+### Added
+
+* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
+* Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels.
+* Added support GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced).
+* Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW).
+* Added support for GKCYX layout for grouped convolution backward weight (NGCHW/GKCYX/NGKHW).
+* Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
+* Added support for Stream-K version of mixed fp8/bf16 GEMM
+* Added GEMM pipeline for microscaling (MX) data types
+* Added support for FP16 2:4 structured sparsity to universal GEMM.
+* Added support for Split K for grouped convolution backward data.
+
+### Optimized
+
+None
+
+### Fixes
+
+None
+
+### Changes
+
+* Removed support for gfx940 and gfx941 targets (#1944)
+* Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876)
+* DL and DPP kernels are now enabled by default.
+* Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced.
+* Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced.
+* Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced.
+
+### Known issues
+
+None
+
 ## Composable Kernel 1.1.0 for ROCm 6.1.0
 
 ### Additions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e90f893de0..4e12462a41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,12 +94,14 @@ add_compile_options(-Wno-pass-failed)
 add_compile_options(-Wno-switch-default)
 add_compile_options(-Wno-unique-object-duplication)
 
-if(DL_KERNELS)
+if(NOT DISABLE_DL_KERNELS)
     add_definitions(-DDL_KERNELS)
+    set(DL_KERNELS "ON")
     set(CK_ENABLE_DL_KERNELS "ON")
 endif()
-if(DPP_KERNELS)
+if(NOT DISABLE_DPP_KERNELS)
     add_definitions(-DDPP_KERNELS)
+    set(DPP_KERNELS "ON")
     set(CK_ENABLE_DPP_KERNELS "ON")
 endif()
 option(CK_USE_CODEGEN "Enable codegen library" OFF)
@@ -165,8 +167,10 @@ if(NOT ENABLE_ASAN_PACKAGING)
     if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
         # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
         set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
-    else()
+    elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600300000 AND ${hip_VERSION_FLAT} LESS 600400000)
         set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
+    elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600400000)
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950")
     endif()
 else()
     #build CK only for xnack-supported targets when using ASAN
@@ -198,18 +202,20 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     set(CK_USE_XDL "ON")
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95")
-    message("Enabling FP8 gemms on native architectures")
+    message("Enabling XDL FP8 gemms on native architectures")
     add_definitions(-DCK_USE_GFX94)
     set(CK_USE_GFX94 "ON")
 endif()
-if (SUPPORTED_GPU_TARGETS MATCHES "gfx95")
-	add_definitions(-DCK_USE_AMD_MFMA_GFX950)
-endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     message("Enabling WMMA instances")
     add_definitions(-DCK_USE_WMMA)
     set(CK_USE_WMMA "ON")
 endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+    message("Enabling WMMA FP8 gemms on native architectures")
+    add_definitions(-DCK_USE_WMMA_FP8)
+    set(CK_USE_WMMA_FP8 "ON")
+endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx950")
     add_definitions(-DCK_USE_OCP_FP8)
     set(CK_USE_OCP_FP8 "ON")
@@ -609,6 +615,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
         PACKAGE_NAME examples
    )
    add_subdirectory(example)
+   add_subdirectory(tile_engine)
    if(BUILD_TESTING)
        add_subdirectory(test)
    endif()
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 8ef5c2b726..0900b7a1f8 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -20,10 +20,11 @@ Tejash Shah, 2019-2020
 Xiaoyan Zhou, 2020
 
 [Jianfeng Yan](https://github.com/j4yan), 2021-2022
-
+[Jun Liu](https://github.com/junliume), 2021-2024
 
 ## Product Manager
-[Jun Liu](https://github.com/junliume)
+[John Afaganis](https://github.com/afagaj)
+
 
 
 ## Contributors
diff --git a/Dockerfile b/Dockerfile
index 2873a8500b..c629bd034c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
-FROM ubuntu:22.04
+FROM ubuntu:24.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.3
+ARG ROCMVERSION=6.4
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
@@ -9,19 +9,18 @@ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
 
 # Add rocm repository
 RUN set -xe && \
-    useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \
     apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
     curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
-RUN if [ "$ROCMVERSION" != "6.4" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
+RUN if [ "$ROCMVERSION" != "6.5" ]; then \
+        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60400-1_all.deb  --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60400-1_all.deb && \
         wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
-        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
+        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
+        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
     fi
 
-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
     amdgpu-install -y --usecase=rocm --no-dkms
 
 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
@@ -44,17 +43,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     iputils-ping \
     jq \
     libelf-dev \
-    libncurses5-dev \
     libnuma-dev \
     libpthread-stubs0-dev \
     llvm-amdgpu \
     mpich \
     net-tools \
     pkg-config \
-    python \
-    python3 \
-    python3-dev \
-    python3-pip \
+    python3-full \
     redis \
     rocm-llvm-dev \
     sshpass \
@@ -74,10 +69,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
 # Remove unnecessary rocm components that take a lot of space
     apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
 
-# Update the cmake to version 3.27.5
-RUN pip install --upgrade cmake==3.27.5 && \
 #Install latest ccache
-    git clone https://github.com/ccache/ccache.git && \
+RUN git clone https://github.com/ccache/ccache.git && \
     cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
     cd / && \
@@ -85,6 +78,11 @@ RUN pip install --upgrade cmake==3.27.5 && \
     gunzip /usr/local/bin/ninja.gz && \
     chmod a+x /usr/local/bin/ninja && \
     git clone https://github.com/nico/ninjatracing.git && \
+#Install ClangBuildAnalyzer
+    git clone https://github.com/aras-p/ClangBuildAnalyzer.git && \
+    cd ClangBuildAnalyzer/ && \
+    make -f projects/make/Makefile && \
+    cd / && \
 #Install latest cppcheck
     git clone https://github.com/danmar/cppcheck.git && \
     cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
@@ -93,8 +91,7 @@ RUN pip install --upgrade cmake==3.27.5 && \
     wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
     dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
 # Install packages for processing the performance results
-    pip3 install --upgrade pip && \
-    pip3 install --upgrade pytest sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust setuptools>=75 sshtunnel==0.4.0 && \
+    pip3 install --break-system-packages --upgrade pytest pymysql pandas==2.2.3 sqlalchemy==2.0.3 setuptools-rust setuptools sshtunnel==0.4.0 && \
 # Add render group
     groupadd -f render && \
 # Install the new rocm-cmake version
diff --git a/Dockerfile.compiler b/Dockerfile.compiler
index a22103b96b..7534910681 100644
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
diff --git a/Jenkinsfile b/Jenkinsfile
index 80392bfbed..3e22eb2f01 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -39,11 +39,11 @@ def getBaseDockerImageName(){
     }
     else{
         def ROCM_numeric = "${params.ROCMVERSION}" as float
-        if ( ROCM_numeric < 6.4 ){
-            img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}"
+        if ( ROCM_numeric < 6.5 ){
+            img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
             }
         else{
-            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}"
+            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm${params.ROCMVERSION}"
             }
         }
     return img
@@ -199,8 +199,8 @@ def cmake_build(Map conf=[:]){
     } else{
         setup_args = ' -DBUILD_DEV=On' + setup_args
     }
-    if (params.DL_KERNELS){
-        setup_args = setup_args + " -DDL_KERNELS=ON "
+    if (params.DISABLE_DL_KERNELS){
+        setup_args = setup_args + " -DDISABLE_DL_KERNELS=ON "
     }
 
     if(build_type_debug){
@@ -229,8 +229,11 @@ def cmake_build(Map conf=[:]){
     if (setup_args.contains("gfx10")){
         invocation_tag="gfx10"
     }
-    if (setup_args.contains("gfx90")){
-        invocation_tag="gfx90"
+    if (setup_args.contains("gfx908")){
+        invocation_tag="gfx908"
+    }
+    if (setup_args.contains("gfx90a")){
+        invocation_tag="gfx90a"
     }
     if (setup_args.contains("gfx94")){
         invocation_tag="gfx94"
@@ -285,7 +288,7 @@ def cmake_build(Map conf=[:]){
     if(!setup_args.contains("NO_CK_BUILD")){
         if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
             echo "running ninja build trace"
-            setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake -G Ninja ${setup_args}   .. ")
+            setup_cmd = conf.get("setup_cmd", """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" -O3 -ftime-trace "  .. """)
             build_cmd = conf.get("build_cmd", "${build_envs} ninja -j${nt} ${config_targets}")
         }
         else{
@@ -313,11 +316,20 @@ def cmake_build(Map conf=[:]){
         if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
             if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
                 sh "/ninjatracing/ninjatracing .ninja_log > ck_build_trace.json"
+                sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer  --all . clang_build.log"
+                sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer  --analyze clang_build.log > clang_build_analysis.log"
                 archiveArtifacts "ck_build_trace.json"
-                sh "ninja test"
+                archiveArtifacts "clang_build_analysis.log"
+                // do not run unit tests when building instances only
+                if(!params.BUILD_INSTANCES_ONLY){
+                    sh "ninja test"
+                }
             }
             else{
-                sh "make check"
+                // run unit tests unless building library for all targets
+                if (!params.BUILD_INSTANCES_ONLY){
+                    sh "make check"
+                }
             }
         }
     }
@@ -351,12 +363,12 @@ def cmake_build(Map conf=[:]){
     }
     if (params.RUN_CK_TILE_GEMM_TESTS){
         try{
-            archiveArtifacts "perf_tile_gemm_*.log"
+            archiveArtifacts "perf_tile_gemm_**.log"
             if (arch_type == 1){
-                stash includes: "perf_tile_gemm_**_fp16_gfx90a.log", name: "perf_tile_gemm_log_gfx90a"
+                stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a"
             }
             else if (arch_type == 2){
-                stash includes: "perf_tile_gemm_**_fp16_gfx942.log", name: "perf_tile_gemm_log_gfx942"
+                stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942"
             }
         }
         catch(Exception err){
@@ -383,7 +395,7 @@ def buildHipClangJob(Map conf=[:]){
         def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
         // Jenkins is complaining about the render group 
-        def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
         if (conf.get("enforce_xnack_on", false)) {
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
@@ -452,7 +464,7 @@ def Build_CK(Map conf=[:]){
         def prefixpath = conf.get("prefixpath", "/opt/rocm")
 
         // Jenkins is complaining about the render group 
-        def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
         if (conf.get("enforce_xnack_on", false)) {
             dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
         }
@@ -502,21 +514,27 @@ def Build_CK(Map conf=[:]){
                     else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
                         arch_type = 2
                     }
-                    else if ( runShell('grep -n "gfx1030" rocminfo.log') ) {
+                    else if ( runShell('grep -n "gfx10" rocminfo.log') ) {
                         arch_type = 3
                     }
-                    else if ( runShell('grep -n "gfx1101" rocminfo.log') ) {
+                    else if ( runShell('grep -n "gfx11" rocminfo.log') ) {
                         arch_type = 4
                     }
-                    else if ( runShell('grep -n "gfx1201" rocminfo.log') ) {
+                    else if ( runShell('grep -n "gfx12" rocminfo.log') ) {
                         arch_type = 5
                     }
+                    else if ( runShell('grep -n "gfx908" rocminfo.log') ) {
+                        arch_type = 6
+                    }
                     cmake_build(conf)
-                    if ( !params.BUILD_LEGACY_OS && arch_type == 1 ){
+                    if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch_type == 1 ){
                             echo "Run inductor codegen tests"
                             sh """
-                                  pip install --verbose .
-                                  pytest python/test/test_gen_instances.py
+                                  python3 -m venv ${env.WORKSPACE}
+                                  . ${env.WORKSPACE}/bin/activate
+                                  python3 -m pip install pytest build setuptools setuptools_scm
+                                  python3 -m pip install .
+                                  python3 -m pytest python/test/test_gen_instances.py
                             """
                     }
                     dir("build"){
@@ -582,7 +600,14 @@ def Build_CK(Map conf=[:]){
                             sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
                             archiveArtifacts "perf_onnx_gemm_gfx12.log"
                             stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
-                        }                        
+                        }
+                        else if ( arch_type == 6 ){
+                            // run basic tests on gfx908
+                            echo "Run performance tests"
+                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908"
+                            archiveArtifacts "perf_onnx_gemm_gfx908.log"
+                            stash includes: "perf_onnx_gemm_gfx908.log", name: "perf_log_gfx908"
+                        }
                         }
                     }
                     if (params.hipTensor_test && arch_type == 1 ){
@@ -713,12 +738,12 @@ def process_results(Map conf=[:]){
 }
 
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
-                                              0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true
-                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
+                                              0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true
+                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+                                              0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                               0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
-                                              0 13 * * * % BUILD_LEGACY_OS=true''' : ""
+                                              0 13 * * * % BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false''' : ""
 
 pipeline {
     agent none
@@ -739,7 +764,7 @@ pipeline {
             description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
         string(
             name: 'ROCMVERSION', 
-            defaultValue: '6.3',
+            defaultValue: '6.4',
             description: 'Specify which ROCM version to use: 6.3 (default).')
         string(
             name: 'COMPILER_VERSION', 
@@ -758,7 +783,7 @@ pipeline {
             defaultValue: false,
             description: "Select whether to run small set of performance tests (default) or full QA")
         booleanParam(
-            name: "DL_KERNELS",
+            name: "DISABLE_DL_KERNELS",
             defaultValue: false,
             description: "Select whether to build DL kernels (default: OFF)")
         booleanParam(
@@ -795,12 +820,16 @@ pipeline {
             description: "Run the ck_tile FMHA tests (default: OFF)")
         booleanParam(
             name: "RUN_CK_TILE_GEMM_TESTS",
-            defaultValue: true,
-            description: "Run the ck_tile GEMM tests (default: ON)")
+            defaultValue: false,
+            description: "Run the ck_tile GEMM tests (default: OFF)")
         booleanParam(
             name: "BUILD_INSTANCES_ONLY",
             defaultValue: false,
             description: "Test building instances for various architectures simultaneously (default: OFF)")
+        booleanParam(
+            name: "BUILD_GFX908",
+            defaultValue: false,
+            description: "Build CK and run tests on gfx908 (default: OFF)")
         booleanParam(
             name: "BUILD_GFX12",
             defaultValue: true,
@@ -813,6 +842,10 @@ pipeline {
             name: "BUILD_LEGACY_OS",
             defaultValue: false,
             description: "Try building CK with legacy OS dockers: RHEL8 and SLES15 (default: OFF)")
+        booleanParam(
+            name: "RUN_INDUCTOR_TESTS",
+            defaultValue: false,
+            description: "Run inductor codegen tests (default: OFF)")
     }
     environment{
         dbuser = "${dbuser}"
@@ -857,8 +890,8 @@ pipeline {
                                 | grep -v 'build/' \
                                 | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\' && \
                                 /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
-                                -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 -D DL_KERNELS \
-                                -D __gfx908__ -D __gfx90a__ -D __gfx940__ -D __gfx941__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
+                                -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
+                                -D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
                                 -U __gfx803__ -U __gfx900__ -U __gfx906__ -U CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 \
                                 --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"
                     }
@@ -907,8 +940,8 @@ pipeline {
                     environment{
                         setup_args = "NO_CK_BUILD"
                         execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
-                                           make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
-                                           ./bin/test_grouped_convnd_fwd_large_cases_xdl"""
+                                           make -j64 test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_bwd_data_xdl_large_cases && \
+                                           ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases"""
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -998,7 +1031,7 @@ pipeline {
                     environment{
                         setup_args = "NO_CK_BUILD"
                         execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
-                                           make -j64 tile_example_gemm_basic tile_example_gemm_universal && \
+                                           make -j64 tile_example_gemm_universal && \
                                            cd ../ &&
                                            example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
                     }
@@ -1017,7 +1050,7 @@ pipeline {
                     environment{
                         setup_args = "NO_CK_BUILD"
                         execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
-                                           make -j64 tile_example_gemm_basic tile_example_gemm_universal && \
+                                           make -j64 tile_example_gemm_universal && \
                                            cd ../ &&
                                            example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
                     }
@@ -1077,14 +1110,14 @@ pipeline {
                         beforeAgent true
                         expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx90a") }
+                    agent{ label rocmnode("gfx942") }
                     environment{
                         setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                                         -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
+                                         -DGPU_TARGETS="gfx90a;gfx942" \
                                          -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
+                                           -DGPU_TARGETS="gfx90a;gfx942" \
                                            -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
@@ -1093,18 +1126,18 @@ pipeline {
                         cleanWs()
                     }
                 }
-                stage("Build CK and run Tests on gfx942")
+                stage("Build CK and run Tests on gfx908")
                 {
                     when {
                         beforeAgent true
-                        expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+                        expression { params.BUILD_GFX908.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx942") }
+                    agent{ label rocmnode("gfx908") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx942" -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908" -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx942" \
+                                           -DGPU_TARGETS="gfx908" \
                                            -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
@@ -1139,13 +1172,13 @@ pipeline {
                         beforeAgent true
                         expression { params.BUILD_INSTANCES_ONLY.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                     }
-                    agent{ label rocmnode("gfx90a") }
+                    agent{ label rocmnode("gfx942") }
                     environment{
-                        execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
+                        execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -D CMAKE_BUILD_TYPE=Release \
-                                           -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"  \
-                                           -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
+                                           -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1151;gfx1201"  \
+                                           -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1160,7 +1193,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx1030") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DCMAKE_CXX_FLAGS=" -O3 " """ 
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
                                            -DGPU_TARGETS="gfx1030" \
@@ -1180,7 +1213,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx1101") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
                                            -DGPU_TARGETS="gfx1101" \
@@ -1200,7 +1233,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx1201") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
                                            -DGPU_TARGETS="gfx1201" \
diff --git a/README.md b/README.md
index 95f44d8872..29d3d4e85a 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
 
     You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
     to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
-    use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.
+    use a semicolon between each; for example, `gfx908;gfx90a;gfx942`.
 
     ```bash
     cmake                                                                                             \
@@ -104,6 +104,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
     ```bash
     make -j install
     ```
+    **[See Note on -j](#notes)**
 
 ## Optional post-install steps
 
@@ -146,7 +147,8 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
     python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
     ```
 
-Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
+### Notes
+The `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
 However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
 crash. On average, you should expect each thread to use ~2Gb of RAM.
 Depending on the number of CPU cores and the amount of RAM on your system, you may want to
@@ -158,12 +160,12 @@ Additional cmake flags can be used to significantly speed-up the build:
   instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
   other data types.
 
-* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
+* `DISABLE_DL_KERNELS` (default is OFF) must be set to ON in order not to build instances, such as `gemm_dl` or
   `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
   other platforms have faster instances, such as `xdl` or `wmma`, available.
 
-* `DPP_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dpp`. 
-  These instances are useful on architectures like the NAVI2x, as most other platforms have faster instances, such as `xdl` or `wmma`, available.
+* `DISABLE_DPP_KERNELS` (default is OFF) must be set to ON in order not to build instances, such as `gemm_dpp`. 
+  These instances offer a slightly better performance of fp16 gemms on NAVI2x. But on other architectures faster alternatives are available.
 
 * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
   such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not  have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
@@ -211,4 +213,4 @@ script/uninstall_precommit.sh
 ```
 
 If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
-`git commit` command.
+`git commit` command.
\ No newline at end of file
diff --git a/client_example/07_grouped_convnd_fwd/README.md b/client_example/07_grouped_convnd_fwd/README.md
index 28a64ad733..9e96df222d 100644
--- a/client_example/07_grouped_convnd_fwd/README.md
+++ b/client_example/07_grouped_convnd_fwd/README.md
@@ -30,14 +30,14 @@ List of the device operations for grouped convolution forward in CK:
 
 Table of supported cases by instance factory with XDL instruction:
 
-|       |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
-|-------|---|---|---|
-|bf16 |2D, 3D|2D|1D, 2D, 3D|
-|fp16 |2D, 3D|2D|1D, 2D, 3D|
-|fp32 |2D, 3D|2D|1D, 2D, 3D|
-|int8 |2D, 3D|2D|1D, 3D|
-|fp8  |3D|&cross;|&cross;|
-|bf8  |3D|&cross;|&cross;|
+|       |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK|
+|-------|---|---|---|---|
+|bf16 |2D, 3D|2D|2D|1D, 2D, 3D|
+|fp16 |2D, 3D|2D|2D|1D, 2D, 3D|
+|fp32 |2D, 3D|2D|2D|1D, 2D, 3D|
+|int8 |2D, 3D|2D|2D|1D, 3D|
+|fp8  |3D|&cross;|&cross;|&cross;|
+|bf8  |3D|&cross;|&cross;|&cross;|
 
 Table of supported cases by instance factory with WMMA instruction:
 
diff --git a/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt b/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
index d10c39ed80..42a29a1d42 100644
--- a/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
+++ b/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
@@ -1,6 +1,9 @@
 add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
 target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_conv_operations)
 
+add_executable(client_grouped_conv2d_bwd_data_ngchw grouped_conv2d_bwd_data_ngchw.cpp)
+target_link_libraries(client_grouped_conv2d_bwd_data_ngchw PRIVATE composable_kernel::device_conv_operations)
+
 add_executable(client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp)
 target_link_libraries(client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_conv_operations)
 
diff --git a/client_example/10_grouped_convnd_bwd_data/README.md b/client_example/10_grouped_convnd_bwd_data/README.md
index 0ed133310e..e26fc3516e 100644
--- a/client_example/10_grouped_convnd_bwd_data/README.md
+++ b/client_example/10_grouped_convnd_bwd_data/README.md
@@ -31,9 +31,9 @@ Table of supported cases by instance factory with XDL instruction:
 
 |       |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
 |-------|---|---|---|
-|bf16|2D, 3D|&cross;|2D, 3D|
-|fp16 |2D, 3D|&cross;|2D, 3D|
-|fp32  |2D, 3D|&cross;|2D, 3D|
+|bf16|2D, 3D|2D, 3D|2D, 3D|
+|fp16 |2D, 3D|2D, 3D|2D, 3D|
+|fp32  |2D, 3D|2D, 3D|2D, 3D|
 
 Table of supported cases by instance factory with WMMA instruction:
 
diff --git a/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data_ngchw.cpp b/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data_ngchw.cpp
new file mode 100644
index 0000000000..2309d757f0
--- /dev/null
+++ b/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data_ngchw.cpp
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout    = ck::tensor_layout::convolution::NGCHW;
+using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
+using OutLayout   = ck::tensor_layout::convolution::NGKHW;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 32;
+static constexpr ck::index_t N             = 256;
+static constexpr ck::index_t K             = 192;
+static constexpr ck::index_t C             = 192;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 28;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main()
+{
+    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
+    std::array<ck::index_t, NumDimSpatial + 3> in_strides{
+        C * Hi * Wi, G * C * Hi * Wi, Wi, 1, Hi * Wi};
+
+    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
+    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{K * Y * X * C, Y * X * C, X * C, C, 1};
+
+    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
+    std::array<ck::index_t, NumDimSpatial + 3> out_strides{
+        K * Ho * Wo, G * K * Ho * Wo, Wo, 1, Ho * Wo};
+
+    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
+    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+
+    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
+                                                                                     OutLayout,
+                                                                                     WeiLayout,
+                                                                                     ck::Tuple<>,
+                                                                                     InLayout,
+                                                                                     OutDataType,
+                                                                                     WeiDataType,
+                                                                                     ck::Tuple<>,
+                                                                                     InDataType,
+                                                                                     PassThrough,
+                                                                                     PassThrough,
+                                                                                     PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        {},
+                                                        in.GetDeviceBuffer(),
+                                                        out_lengths,
+                                                        out_strides,
+                                                        wei_lengths,
+                                                        wei_strides,
+                                                        {},
+                                                        {},
+                                                        in_lengths,
+                                                        in_strides,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        SimpleDeviceMem workspace_dev(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+                                    sizeof(WeiDataType) * G * K * Y * X * C +
+                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        {},
+                                                        in.GetDeviceBuffer(),
+                                                        out_lengths,
+                                                        out_strides,
+                                                        wei_lengths,
+                                                        wei_strides,
+                                                        {},
+                                                        {},
+                                                        in_lengths,
+                                                        in_strides,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+}
diff --git a/client_example/11_grouped_conv_bwd_weight/README.md b/client_example/11_grouped_conv_bwd_weight/README.md
index ed3dff0f1e..f1ba95e9cd 100644
--- a/client_example/11_grouped_conv_bwd_weight/README.md
+++ b/client_example/11_grouped_conv_bwd_weight/README.md
@@ -34,12 +34,12 @@ List of the device operations for grouped convolution backward weight in CK:
 
 Table of supported cases by instance factory with XDL instruction:
 
-|       |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
-|-------|---|---|---|
-|bf16|2D, 3D|&cross;|&cross;|
-|bf16(fp32 for weight)|2D, 3D|&cross;|1D, 2D, 3D|
-|fp16 |2D, 3D|&cross;|1D, 2D, 3D|
-|fp32  |2D, 3D|&cross;|1D, 2D, 3D|
+|       |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK|
+|-------|---|---|---|---|
+|bf16|2D, 3D|2D, 3D|2D, 3D|&cross;|
+|bf16(fp32 for weight)|2D, 3D|&cross;|&cross;|1D, 2D, 3D|
+|fp16 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D|
+|fp32  |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D|
 
 Table of supported cases by instance factory with WMMA instruction:
 
diff --git a/cmake/ClangTidy.cmake b/cmake/ClangTidy.cmake
index cf77991a64..d0d30d669a 100644
--- a/cmake/ClangTidy.cmake
+++ b/cmake/ClangTidy.cmake
@@ -144,7 +144,7 @@ function(clang_tidy_check TARGET)
     # COMMAND ${CLANG_TIDY_COMMAND} $<JOIN:$<TARGET_PROPERTY:${TARGET},SOURCES>, >
     foreach(SOURCE ${SOURCES})
         if((NOT "${SOURCE}" MATCHES "(h|hpp|hxx)$") AND (NOT "${SOURCE}" MATCHES "TARGET_OBJECTS"))
-            string(MAKE_C_IDENTIFIER "${SOURCE}" tidy_file)
+            string(MD5 tidy_file "${SOURCE}")
             set(tidy_target tidy-target-${TARGET}-${tidy_file})
             add_custom_target(${tidy_target}
                 # for some targets clang-tidy not able to get information from .clang-tidy
diff --git a/codegen/include/ck/host/device_batched_gemm_softmax_gemm/problem.hpp b/codegen/include/ck/host/device_batched_gemm_softmax_gemm/problem.hpp
index 428034a3ba..30dd1487ca 100644
--- a/codegen/include/ck/host/device_batched_gemm_softmax_gemm/problem.hpp
+++ b/codegen/include/ck/host/device_batched_gemm_softmax_gemm/problem.hpp
@@ -15,31 +15,32 @@ namespace device_batched_gemm_softmax_gemm {
 // defines the problem specification for a GEMM operation
 struct Problem
 {
-    std::size_t M            = 0;
-    std::size_t N            = 0;
-    std::size_t K            = 0;
-    std::size_t O            = 0;
-    bool TransA              = false;
-    bool TransB              = false;
-    bool TransB1             = false;
-    bool TransC              = false;
-    DataType ADataType       = DataType::Half;
-    DataType BDataType       = DataType::Half;
-    DataType B1DataType      = DataType::Half;
-    DataType CDataType       = DataType::Half;
-    std::string AElementOp   = PassThrough;
-    std::string BElementOp   = PassThrough;
-    std::string B1ElementOp  = PassThrough;
-    std::string CElementOp   = PassThrough;
-    std::string AccElementOp = Scale;
+    std::size_t M             = 0;
+    std::size_t N             = 0;
+    std::size_t K             = 0;
+    std::size_t O             = 0;
+    bool TransA               = false;
+    bool TransB               = false;
+    bool TransB1              = false;
+    bool TransC               = false;
+    DataType ADataType        = DataType::Half;
+    DataType BDataType        = DataType::Half;
+    DataType B1DataType       = DataType::Half;
+    DataType CDataType        = DataType::Half;
+    std::string AElementOp    = PassThrough;
+    std::string BElementOp    = PassThrough;
+    std::string B1ElementOp   = PassThrough;
+    std::string CElementOp    = PassThrough;
+    std::string AccElementOp  = Scale;
+    bool MaskOutUpperTriangle = false;
 
     // returns the correct device op file for the operation
     std::string GetIncludeHeader() const;
 
     // returns a list of instances based on the problem spec and provided fusion operations
     std::vector<Solution> GetSolutions(const std::string& arch,
-                                       const std::string& prologue,
-                                       const std::string& epilogue) const;
+                                       const std::string& prologue = "",
+                                       const std::string& epilogue = "") const;
 };
 
 } // namespace device_batched_gemm_softmax_gemm
diff --git a/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp b/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
index f4036328ec..1c65fb71ff 100644
--- a/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
+++ b/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
@@ -37,8 +37,8 @@ struct Problem
 
     // returns a list of instances based on the problem spec and provided fusion operations
     std::vector<Solution> GetSolutions(const std::string& arch,
-                                       const std::string& prologue,
-                                       const std::string& epilogue) const;
+                                       const std::string& prologue = "",
+                                       const std::string& epilogue = "") const;
 };
 
 } // namespace device_gemm_multiple_d
diff --git a/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp b/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
index b12c2e1a4a..6029ab0c7d 100644
--- a/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
@@ -259,10 +259,7 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
                                             x.tile_desc.gemm1_n_per_block);
         x.update_prologue(prologue);
         x.update_epilogue(epilogue);
-        x.mask_out_upper_triangle = true;
-        result.push_back(x);
-
-        x.mask_out_upper_triangle = false;
+        x.mask_out_upper_triangle = prob.MaskOutUpperTriangle;
         result.push_back(x);
     }
     return result;
@@ -273,13 +270,20 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 std::vector<std::vector<Operation_Xdl_CShuffle>>
 Operation_Xdl_CShuffle::CreateOperations(const std::string& prologue, const std::string& epilogue)
 {
+    std::vector<Problem> problems;
+
     Problem prob;
     prob.TransA  = false;
     prob.TransB  = true;
     prob.TransB1 = false;
     prob.TransC  = false;
+    problems.push_back(prob);
 
-    return {CreateOperations(prob, prologue, epilogue)};
+    prob.MaskOutUpperTriangle = true;
+    problems.push_back(prob);
+
+    return Transform(problems,
+                     [&](const Problem& p) { return CreateOperations(p, prologue, epilogue); });
 }
 
 static const char* const DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffleTemplate =
diff --git a/codegen/src/utils.cpp b/codegen/src/utils.cpp
index 19627d4cf6..c15a9fd7d3 100644
--- a/codegen/src/utils.cpp
+++ b/codegen/src/utils.cpp
@@ -13,7 +13,7 @@ std::size_t integer_divide_ceil(std::size_t x, std::size_t y)
 
 const std::unordered_set<std::string>& get_xdlop_archs()
 {
-    static std::unordered_set<std::string> supported_archs{"gfx90a", "gfx908", "gfx940", "gfx942"};
+    static std::unordered_set<std::string> supported_archs{"gfx90a", "gfx908", "gfx942"};
     return supported_archs;
 }
 
diff --git a/codegen/test/batched_gemm_softmax_gemm.cpp b/codegen/test/batched_gemm_softmax_gemm.cpp
index 3f0b8bfe6a..13035df355 100644
--- a/codegen/test/batched_gemm_softmax_gemm.cpp
+++ b/codegen/test/batched_gemm_softmax_gemm.cpp
@@ -42,16 +42,13 @@ TEST_CASE(test_problem_kernel)
     prob.K      = 1024;
     prob.O      = 1024;
     prob.TransB = true;
-    check_all<half> check1, check2;
+    check_all<half> check;
     auto a  = to_gpu(generate_buffer<half>(1024 * 1024, 0));
     auto b  = to_gpu(generate_buffer<half>(1024 * 1024, 1));
     auto b1 = to_gpu(generate_buffer<half>(1024 * 1024, 2));
     auto c  = to_gpu(generate_buffer<half>(1024 * 1024, 3));
 
-    std::string epilogue = "";
-    std::string prologue = "";
-
-    auto solutions = prob.GetSolutions("gfx90a", prologue, epilogue);
+    auto solutions = prob.GetSolutions("gfx90a");
     std::cout << "Num solutions: " << solutions.size() << std::endl;
     for(auto i = 0; i < solutions.size(); ++i)
     {
@@ -77,10 +74,8 @@ TEST_CASE(test_problem_kernel)
         k.launch(nullptr, grid_size * block_size, block_size)(
             a.data(), b.data(), b1.data(), c.data());
 
-        if(solution.GetTemplateParameter<bool>("MaskOutUpperTriangle"))
-            CHECK(report(solution, check1(rtc::from_gpu(c))));
-        else
-            CHECK(report(solution, check2(rtc::from_gpu(c))));
+        // NOTE: Solutions where MaskOutUpperTriangle is True don't produce consistent results
+        CHECK(report(solution, check(rtc::from_gpu(c))));
     }
 }
 
diff --git a/codegen/test/gemm_multiple_d.cpp b/codegen/test/gemm_multiple_d.cpp
index 2a383fc1c8..adc8e1ff02 100644
--- a/codegen/test/gemm_multiple_d.cpp
+++ b/codegen/test/gemm_multiple_d.cpp
@@ -53,10 +53,7 @@ TEST_CASE(test_problem_kernel)
     auto b = to_gpu(generate_buffer<half>(1024 * 1024, 1));
     auto c = to_gpu(generate_buffer<half>(1024 * 1024, 2));
 
-    std::string epilogue = "";
-    std::string prologue = "";
-
-    auto solutions = prob.GetSolutions("gfx90a", prologue, epilogue);
+    auto solutions = prob.GetSolutions("gfx90a");
     std::cout << "Num solutions: " << solutions.size() << std::endl;
     for(auto i = 0; i < solutions.size(); ++i)
     {
diff --git a/codegen/test/rtc/src/compile_kernel.cpp b/codegen/test/rtc/src/compile_kernel.cpp
index a8da88be09..262e6bae46 100644
--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
@@ -279,6 +279,7 @@ static kernel hiprtc_compile_kernel(const std::vector<src_file>& srcs, compile_o
 {
     options.flags += " -I. -O3";
     options.flags += " -std=c++17";
+    options.flags += " -DCK_CODE_GEN_RTC";
     options.flags += " --offload-arch=" + get_device_name();
     auto cos = compile_hip_src_with_hiprtc(srcs, options);
     if(cos.size() != 1)
diff --git a/docs/reference/Supported_Primitives_Guide.rst b/docs/conceptual/Composable-Kernel-math.rst
similarity index 85%
rename from docs/reference/Supported_Primitives_Guide.rst
rename to docs/conceptual/Composable-Kernel-math.rst
index e24acf5656..1c21fd8a11 100644
--- a/docs/reference/Supported_Primitives_Guide.rst
+++ b/docs/conceptual/Composable-Kernel-math.rst
@@ -1,18 +1,15 @@
 .. meta::
-  :description: Composable Kernel documentation and API reference library
-  :keywords: composable kernel, CK, ROCm, API, documentation
+  :description: Composable Kernel mathematical basis
+  :keywords: composable kernel, CK, ROCm, API, mathematics, algorithm
 
 .. _supported-primitives:
 
 ********************************************************************
-Supported Primitives Guide
+Composable Kernel mathematical basis
 ********************************************************************
 
-This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
+This is an introduction to the math which underpins the algorithms implemented in Composable Kernel.
 
-------------
-Softmax
-------------
 
 For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` you can decompose the
 softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
diff --git a/docs/conceptual/Composable-Kernel-structure.rst b/docs/conceptual/Composable-Kernel-structure.rst
new file mode 100644
index 0000000000..43c3603b95
--- /dev/null
+++ b/docs/conceptual/Composable-Kernel-structure.rst
@@ -0,0 +1,29 @@
+.. meta::
+  :description: Composable Kernel structure
+  :keywords: composable kernel, CK, ROCm, API, structure
+
+.. _what-is-ck:
+
+********************************************************************
+Composable Kernel structure
+********************************************************************
+
+The Composable Kernel library uses a tile-based programming model and tensor coordinate transformation to achieve performance portability and code maintainability. Tensor coordinate transformation is a complexity reduction technique for complex machine learning operators.
+  
+
+.. image:: ../data/ck_component.png
+   :alt: CK Components
+
+
+The Composable Kernel library consists of four layers: 
+
+* a templated tile operator layer
+* a templated kernel and invoker layer
+* an instantiated kernel and invoker layer
+* a client API layer.
+
+A wrapper component is included to simplify tensor transform operations.
+
+.. image:: ../data/ck_layer.png
+   :alt: CK Layers
+   
\ No newline at end of file
diff --git a/docs/conceptual/what-is-ck.rst b/docs/conceptual/what-is-ck.rst
deleted file mode 100644
index 36785fc6ca..0000000000
--- a/docs/conceptual/what-is-ck.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-.. meta::
-  :description: Composable Kernel documentation and API reference library
-  :keywords: composable kernel, CK, ROCm, API, documentation
-
-.. _what-is-ck:
-
-********************************************************************
-What is the Composable Kernel library
-********************************************************************
-
-
-Methodology
-===========
-
-The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++.
-
-CK utilizes two concepts to achieve performance portability and code maintainability:
-
-* A tile-based programming model
-* Algorithm complexity reduction for complex ML operators using an innovative technique called
-  "Tensor Coordinate Transformation".
-
-.. image:: ../data/ck_component.png
-   :alt: CK Components
-
-
-Code Structure
-==============
-
-The CK library is structured into 4 layers:
-
-* "Templated Tile Operators" layer
-* "Templated Kernel and Invoker" layer
-* "Instantiated Kernel and Invoker" layer
-* "Client API" layer
-
-It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
-
-.. image:: ../data/ck_layer.png
-   :alt: CK Layers
-   
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index e8617a09ef..fe8a1c1d79 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -28,6 +28,7 @@ external_toc_path = "./sphinx/_toc.yml"
 
 docs_core = ROCmDocs(left_nav_title)
 docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
+docs_core.enable_api_reference()
 docs_core.setup()
 
 external_projects_current_project = "composable_kernel"
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index fac9e138e1..d6f38e0ca9 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.10
+# Doxyfile 1.9.7
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -12,16 +12,26 @@
 # For lists, items can also be appended using:
 # TAG += value [value, ...]
 # Values that contain spaces should be placed between quotes (\" \").
+#
+# Note:
+#
+# Use doxygen to compare the used configuration file with the template
+# configuration file:
+# doxygen -x [configFile]
+# Use doxygen to compare the used configuration file with the template
+# configuration file without replacing the environment variables or CMake type
+# replacement variables:
+# doxygen -x_noenv [configFile]
 
 #---------------------------------------------------------------------------
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -44,14 +54,14 @@ PROJECT_NUMBER         = v3.0.1.0
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF          = "prototype interfaces compatible with ROCm platform and HiP"
+PROJECT_BRIEF          = "prototype interfaces compatible with ROCm platform and HIP"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
 
-PROJECT_LOGO           = 
+PROJECT_LOGO           =
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
@@ -60,16 +70,28 @@ PROJECT_LOGO           =
 
 OUTPUT_DIRECTORY       = .
 
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096
+# sub-directories (in 2 levels) under the output directory of each output format
+# and will distribute the generated files over these directories. Enabling this
 # option can be useful when feeding doxygen a huge amount of source files, where
 # putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
+# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to
+# control the number of sub-directories.
 # The default value is: NO.
 
 CREATE_SUBDIRS         = NO
 
+# Controls the number of sub-directories that will be created when
+# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every
+# level increment doubles the number of directories, resulting in 4096
+# directories at level 8 which is the default and also the maximum value. The
+# sub-directories are organized in 2 levels, the first level always has a fixed
+# number of 16 directories.
+# Minimum value: 0, maximum value: 8, default value: 8.
+# This tag requires that the tag CREATE_SUBDIRS is set to YES.
+
+CREATE_SUBDIRS_LEVEL   = 8
+
 # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
 # characters to appear in the names of generated files. If set to NO, non-ASCII
 # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
@@ -81,14 +103,14 @@ ALLOW_UNICODE_NAMES    = NO
 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
 # documentation generated by doxygen is written. Doxygen will use this
 # information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian,
+# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English
+# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek,
+# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with
+# English messages), Korean, Korean-en (Korean with English messages), Latvian,
+# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese,
+# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish,
+# Swedish, Turkish, Ukrainian and Vietnamese.
 # The default value is: English.
 
 OUTPUT_LANGUAGE        = English
@@ -162,7 +184,8 @@ FULL_PATH_NAMES        = YES
 # will be relative from the directory where doxygen is started.
 # This tag requires that the tag FULL_PATH_NAMES is set to YES.
 
-STRIP_FROM_PATH        =
+#STRIP_FROM_PATH        = 
+STRIP_FROM_PATH        = /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/latest/ 
 
 # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
 # path mentioned in the documentation of a class, which tells the reader which
@@ -171,7 +194,8 @@ STRIP_FROM_PATH        =
 # specify the list of include paths that are normally passed to the compiler
 # using the -I flag.
 
-STRIP_FROM_INC_PATH    =
+STRIP_FROM_INC_PATH    = 
+
 
 # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
 # less readable) file names. This can be useful is your file systems doesn't
@@ -189,6 +213,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -209,6 +243,14 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -232,20 +274,19 @@ TAB_SIZE               = 4
 # the documentation. An alias has the form:
 # name=value
 # For example adding
-# "sideeffect=@par Side Effects:\n"
+# "sideeffect=@par Side Effects:^^"
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
 
 ALIASES                =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -274,28 +315,40 @@ OPTIMIZE_FOR_FORTRAN   = NO
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -303,6 +356,26 @@ EXTENSION_MAPPING      =
 
 MARKDOWN_SUPPORT       = YES
 
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
+# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to
+# generate identifiers for the Markdown headings. Note: Every identifier is
+# unique.
+# Possible values are: DOXYGEN Use a fixed 'autotoc_md' string followed by a
+# sequence number starting at 0. and GITHUB Use the lower case version of title
+# with any whitespace replaced by '-' and punctations characters removed..
+# The default value is: DOXYGEN.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+MARKDOWN_ID_STYLE      = DOXYGEN
+
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
 # be prevented in individual cases by putting a % sign in front of the word or
@@ -328,7 +401,7 @@ BUILTIN_STL_SUPPORT    = YES
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -414,6 +487,27 @@ TYPEDEF_HIDES_STRUCT   = YES
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
+# If the TIMESTAMP tag is set different from NO then each generated page will
+# contain the date or date and time when the page was generated. Setting this to
+# NO can help when comparing the output of multiple runs.
+# Possible values are: YES, NO, DATETIME and DATE.
+# The default value is: NO.
+
+TIMESTAMP              = YES
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -434,6 +528,12 @@ EXTRACT_ALL            = YES
 
 EXTRACT_PRIVATE        = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -471,6 +571,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -482,14 +589,15 @@ HIDE_UNDOC_MEMBERS     = NO
 # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
 # undocumented classes that are normally visible in the class hierarchy. If set
 # to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
+# will also hide undocumented C++ concepts if enabled. This option has no effect
+# if EXTRACT_ALL is enabled.
 # The default value is: NO.
 
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
@@ -508,12 +616,20 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# Possible values are: SYSTEM, NO and YES.
+# The default value is: SYSTEM.
 
 CASE_SENSE_NAMES       = NO
 
@@ -531,6 +647,12 @@ HIDE_SCOPE_NAMES       = NO
 
 HIDE_COMPOUND_REFERENCE= NO
 
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE        = YES
+
 # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
 # the files that are included by a file in the documentation of that file.
 # The default value is: YES.
@@ -688,7 +810,8 @@ FILE_VERSION_FILTER    =
 # output files in an output format independent way. To create the layout file
 # that represents doxygen's defaults, run doxygen with the -l option. You can
 # optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
 #
 # Note that if you run doxygen from a directory containing a file called
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
@@ -699,7 +822,7 @@ LAYOUT_FILE            =
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -734,34 +857,81 @@ WARNINGS               = YES
 WARN_IF_UNDOCUMENTED   = YES
 
 # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
 # The default value is: YES.
 
 WARN_IF_DOC_ERROR      = YES
 
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
 # The default value is: NO.
 
 WARN_NO_PARAMDOC       = NO
 
+# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about
+# undocumented enumeration values. If set to NO, doxygen will accept
+# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: NO.
+
+WARN_IF_UNDOC_ENUM_VAL = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves
+# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not
+# write the warning messages in between other messages but write them at the end
+# of a run, in case a WARN_LOGFILE is defined the warning messages will be
+# besides being in the defined file also be shown at the end of a run, unless
+# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case
+# the behavior will remain as with the setting FAIL_ON_WARNINGS.
+# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
 # will be replaced by the file and line number from which the warning originated
 # and the warning text. Optionally the format may contain $version, which will
 # be replaced by the version of the file (if it could be obtained via
 # FILE_VERSION_FILTER)
+# See also: WARN_LINE_FORMAT
 # The default value is: $file:$line: $text.
 
 WARN_FORMAT            = "$file:$line: $text"
 
+# In the $text part of the WARN_FORMAT command it is possible that a reference
+# to a more specific place is given. To make it easier to jump to this place
+# (outside of doxygen) the user can define a custom "cut" / "paste" string.
+# Example:
+# WARN_LINE_FORMAT = "'vi $file +$line'"
+# See also: WARN_FORMAT
+# The default value is: at line $line of file $file.
+
+WARN_LINE_FORMAT       = "at line $line of file $file"
+
 # The WARN_LOGFILE tag can be used to specify a file to which warning and error
 # messages should be written. If left blank the output is written to standard
-# error (stderr).
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
 
 WARN_LOGFILE           =
 
@@ -785,12 +955,23 @@ INPUT                  = ../../include/ck/tensor_operation/gpu/grid \
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# See also: INPUT_FILE_ENCODING
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
 
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify
+# character encoding on a per file pattern basis. Doxygen will compare the file
+# name with each pattern and apply the encoding instead of the default
+# INPUT_ENCODING) if there is a match. The character encodings are a list of the
+# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding
+# "INPUT_ENCODING" for further information on supported encodings.
+
+INPUT_FILE_ENCODING    =
+
 # If the value of the INPUT tag contains directories, you can use the
 # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
 # *.h) to filter out the source-files in the directories.
@@ -799,11 +980,15 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          = *.c \
                          *.cc \
@@ -824,6 +1009,7 @@ FILE_PATTERNS          = *.c \
                          *.hxx \
                          *.hpp \
                          *.h++ \
+                         *.l \
                          *.cs \
                          *.d \
                          *.php \
@@ -837,13 +1023,19 @@ FILE_PATTERNS          = *.c \
                          *.mm \
                          *.dox \
                          *.py \
-                         *.tcl \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f18 \
+                         *.f \
+                         *.for \
                          *.vhd \
                          *.vhdl \
                          *.ucf \
                          *.qsf \
-                         *.as \
-                         *.js
+                         *.ice
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
 # be searched for input files as well.
@@ -880,10 +1072,7 @@ EXCLUDE_PATTERNS       =
 # (namespaces, classes, functions, etc.) that should be excluded from the
 # output. The symbol name can be a fully qualified name, a word, or if the
 # wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
+# ANamespace::AClass, ANamespace::*Test
 
 EXCLUDE_SYMBOLS        =
 
@@ -927,6 +1116,15 @@ IMAGE_PATH             =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that doxygen will use the data processed and written to standard output
+# for further processing, therefore nothing else, like debug statements or used
+# commands (so in case of a Windows batch file always use @echo OFF), should be
+# written to standard output.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
@@ -936,6 +1134,10 @@ INPUT_FILTER           =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
@@ -959,7 +1161,16 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
-USE_MDFILE_AS_MAINPAGE = ../README.md
+USE_MDFILE_AS_MAINPAGE = ../../README.md
+
+# The Fortran standard specifies that for fixed formatted Fortran code all
+# characters from position 72 are to be considered as comment. A common
+# extension is to allow longer lines before the automatic comment starts. The
+# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can
+# be processed before the automatic comment starts.
+# Minimum value: 7, maximum value: 10000, default value: 72.
+
+FORTRAN_COMMENT_AFTER  = 72
 
 #---------------------------------------------------------------------------
 # Configuration options related to source browsing
@@ -988,7 +1199,7 @@ INLINE_SOURCES         = NO
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -1020,12 +1231,12 @@ SOURCE_TOOLTIPS        = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1047,25 +1258,6 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1077,17 +1269,11 @@ CLANG_OPTIONS          =
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
+# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes)
+# that should be ignored while generating the index headers. The IGNORE_PREFIX
+# tag works for classes, function and member names. The entity will be placed in
+# the alphabetical list under the first letter of the entity name that remains
+# after removing the prefix.
 # This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
 IGNORE_PREFIX          =
@@ -1134,7 +1320,7 @@ HTML_FILE_EXTENSION    = .html
 # of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_HEADER            =
+HTML_HEADER            = ../_doxygen/header.html
 
 # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
 # generated HTML page. If the tag is left blank doxygen will generate a standard
@@ -1144,7 +1330,7 @@ HTML_HEADER            =
 # that doxygen normally uses.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_FOOTER            =
+HTML_FOOTER            = ../_doxygen/footer.html
 
 # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
 # sheet that is used by each HTML page. It can be used to fine-tune the look of
@@ -1156,7 +1342,7 @@ HTML_FOOTER            =
 # obsolete.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_STYLESHEET        =
+HTML_STYLESHEET        = ../_doxygen/stylesheet.css
 
 # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
 # cascading style sheets that are included after the standard style sheets
@@ -1166,10 +1352,15 @@ HTML_STYLESHEET        =
 # Doxygen will copy the style sheet files to the output directory.
 # Note: The order of the extra style sheet files is of importance (e.g. the last
 # style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
+# list).
+# Note: Since the styling of scrollbars can currently not be overruled in
+# Webkit/Chromium, the styling will be left out of the default doxygen.css if
+# one or more extra stylesheets have been specified. So if scrollbar
+# customization is desired it has to be added explicitly. For an example see the
+# documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_EXTRA_STYLESHEET  =
+HTML_EXTRA_STYLESHEET  = ../_doxygen/extra_stylesheet.css
 
 # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the HTML output directory. Note
@@ -1181,19 +1372,32 @@ HTML_EXTRA_STYLESHEET  =
 
 HTML_EXTRA_FILES       =
 
+# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output
+# should be rendered with a dark or light theme.
+# Possible values are: LIGHT always generate light mode output, DARK always
+# generate dark mode output, AUTO_LIGHT automatically set the mode according to
+# the user preference, use light mode if no preference is set (the default),
+# AUTO_DARK automatically set the mode according to the user preference, use
+# dark mode if no preference is set and TOGGLE allow to user to switch between
+# light and dark mode via a button.
+# The default value is: AUTO_LIGHT.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE        = LIGHT
+
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_COLORSTYLE_HUE    = 220
+HTML_COLORSTYLE_HUE    = 240
 
 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
 # value of 255 will produce the most vivid colors.
 # Minimum value: 0, maximum value: 255, default value: 100.
 # This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1211,14 +1415,16 @@ HTML_COLORSTYLE_SAT    = 100
 
 HTML_COLORSTYLE_GAMMA  = 80
 
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_TIMESTAMP         = NO
+HTML_DYNAMIC_MENUS     = YES
 
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
@@ -1243,13 +1449,14 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1263,6 +1470,13 @@ GENERATE_DOCSET        = NO
 
 DOCSET_FEEDNAME        = "Doxygen generated docs"
 
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL         =
+
 # This tag specifies a string that should uniquely identify the documentation
 # set bundle. This should be a reverse domain-name style string, e.g.
 # com.mycompany.MyDocSet. Doxygen will append .docset to the name.
@@ -1288,8 +1502,12 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# a.o. the download links, offline the HTML help workshop was already many years
+# in maintenance mode). You can download the HTML help workshop from the web
+# archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1319,7 +1537,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
@@ -1346,6 +1564,16 @@ BINARY_TOC             = NO
 
 TOC_EXPAND             = NO
 
+# The SITEMAP_URL tag is used to specify the full URL of the place where the
+# generated documentation will be placed on the server by the user during the
+# deployment of the documentation. The generated sitemap is called sitemap.xml
+# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL
+# is specified no sitemap is generated. For information about the sitemap
+# protocol see https://www.sitemaps.org
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SITEMAP_URL            =
+
 # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
 # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
 # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
@@ -1364,7 +1592,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1372,8 +1601,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1381,30 +1610,30 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1447,16 +1676,28 @@ DISABLE_INDEX          = NO
 # to work a browser that supports JavaScript, DHTML, CSS and frames is required
 # (i.e. any modern browser). Windows users are probably better off using the
 # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 GENERATE_TREEVIEW      = NO
 
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR           = NO
+
 # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
 # doxygen will group on one line in the generated HTML documentation.
 #
@@ -1481,6 +1722,24 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS       = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1490,19 +1749,14 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
 
-FORMULA_TRANSPARENT    = YES
+FORMULA_MACROFILE      =
 
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1512,11 +1766,29 @@ FORMULA_TRANSPARENT    = YES
 
 USE_MATHJAX            = YES
 
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION        = MathJax_2
+
 # When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
 # Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for NathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
 # The default value is: HTML-CSS.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1529,22 +1801,29 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_RELPATH        =
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
 # extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1572,7 +1851,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1591,7 +1870,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1604,8 +1884,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1656,21 +1937,35 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1700,29 +1995,31 @@ PAPER_TYPE             = a4
 
 EXTRA_PACKAGES         =
 
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
 #
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
 
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
 # LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_FOOTER           =
@@ -1755,18 +2052,26 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 USE_PDFLATEX           = YES
 
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
+# The LATEX_BATCHMODE tag ignals the behavior of LaTeX in case of an error.
+# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch
+# mode nothing is printed on the terminal, errors are scrolled as if <return> is
+# hit at every error; missing files that TeX tries to input or request from
+# keyboard input (\read on a not open input stream) cause the job to abort,
+# NON_STOP In nonstop mode the diagnostic message will appear on the terminal,
+# but there is no possibility of user interaction just like in batch mode,
+# SCROLL In scroll mode, TeX will stop only for missing files to input or if
+# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at
+# each error, asking for user intervention.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1779,24 +2084,22 @@ LATEX_BATCHMODE        = NO
 
 LATEX_HIDE_INDICES     = NO
 
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE        = plain
 
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1836,9 +2139,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = NO
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1847,22 +2150,12 @@ RTF_HYPERLINKS         = NO
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
 
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
 #---------------------------------------------------------------------------
 # Configuration options related to the man page output
 #---------------------------------------------------------------------------
@@ -1934,6 +2227,13 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1952,23 +2252,14 @@ GENERATE_DOCBOOK       = NO
 
 DOCBOOK_OUTPUT         = docbook
 
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
 #---------------------------------------------------------------------------
 # Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -2047,7 +2338,8 @@ SEARCH_INCLUDES        = NO
 
 # The INCLUDE_PATH tag can be used to specify one or more directories that
 # contain include files that are not input files but should be processed by the
-# preprocessor.
+# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of
+# RECURSIVE has no effect here.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
 INCLUDE_PATH           =
@@ -2136,41 +2428,10 @@ EXTERNAL_GROUPS        = YES
 
 EXTERNAL_PAGES         = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
 #---------------------------------------------------------------------------
-# Configuration options related to the dot tool
+# Configuration options related to diagram generator tools
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = NO
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
 # If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
 # The default value is: YES.
@@ -2179,7 +2440,7 @@ HIDE_UNDOC_RELATIONS   = YES
 
 # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
 # available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
 # Bell Labs. The other options in this section have no effect if this option is
 # set to NO
 # The default value is: NO.
@@ -2196,35 +2457,52 @@ HAVE_DOT               = NO
 
 DOT_NUM_THREADS        = 0
 
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
+# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of
+# subgraphs. When you want a differently looking font in the dot files that
+# doxygen generates you can specify fontname, fontcolor and fontsize attributes.
+# For details please see <a href=https://graphviz.org/doc/info/attrs.html>Node,
+# Edge and Graph Attributes specification</a> You need to make sure dot is able
+# to find the font, which can be done by putting it in a standard location or by
+# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font. Default graphviz fontsize is 14.
+# The default value is: fontname=Helvetica,fontsize=10.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_FONTNAME           = Helvetica
+DOT_COMMON_ATTR        = "fontname=Helvetica,fontsize=10"
 
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
+# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can
+# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. <a
+# href=https://graphviz.org/doc/info/arrows.html>Complete documentation about
+# arrows shapes.</a>
+# The default value is: labelfontname=Helvetica,labelfontsize=10.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_FONTSIZE           = 10
+DOT_EDGE_ATTR          = "labelfontname=Helvetica,labelfontsize=10"
 
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
+# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes
+# around nodes set 'shape=plain' or 'shape=plaintext' <a
+# href=https://www.graphviz.org/doc/info/shapes.html>Shapes specification</a>
+# The default value is: shape=box,height=0.2,width=0.4.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NODE_ATTR          = "shape=box,height=0.2,width=0.4"
+
+# You can set the path where dot can find font specified with fontname in
+# DOT_COMMON_ATTR and others dot attributes.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_FONTPATH           =
 
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will
+# generate a graph for each documented class showing the direct and indirect
+# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and
+# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case
+# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the
+# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used.
+# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance
+# relations will be shown as texts / links.
+# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 CLASS_GRAPH            = YES
 
@@ -2238,7 +2516,8 @@ CLASS_GRAPH            = YES
 COLLABORATION_GRAPH    = YES
 
 # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
+# groups, showing the direct groups dependencies. See also the chapter Grouping
+# in the manual.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2261,10 +2540,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will wrapped across multiple lines. Some heuristics are apply
+# to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2331,10 +2632,17 @@ GRAPHICAL_HIERARCHY    = YES
 
 DIRECTORY_GRAPH        = YES
 
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH    = 1
+
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
 # generated by dot. For an explanation of the image formats see the section
 # output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
+# https://www.graphviz.org/)).
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
@@ -2371,11 +2679,12 @@ DOT_PATH               =
 
 DOTFILE_DIRS           =
 
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
 
-MSCFILE_DIRS           =
+DIA_PATH               =
 
 # The DIAFILE_DIRS tag can be used to specify one or more directories that
 # contain dia files that are included in the documentation (see the \diafile
@@ -2384,13 +2693,18 @@ MSCFILE_DIRS           =
 DIAFILE_DIRS           =
 
 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
 
 PLANTUML_JAR_PATH      =
 
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.
 
@@ -2420,18 +2734,6 @@ DOT_GRAPH_MAX_NODES    = 50
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
 # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10) support
@@ -2444,14 +2746,34 @@ DOT_MULTI_TARGETS      = NO
 # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
 # explaining the meaning of the various boxes and arrows in the dot generated
 # graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will
+# use a built-in version of mscgen tool to produce the charts. Alternatively,
+# the MSCGEN_TOOL tag can also specify the name an external tool. For instance,
+# specifying prog as the value, doxygen will call the tool as prog -T
+# <outfile_format> -o <outputfile> <inputfile>. The external tool should support
+# output file formats "png", "eps", "svg", and "ismap".
+
+MSCGEN_TOOL            =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
diff --git a/docs/index.rst b/docs/index.rst
index 30ef672f84..6d46eb49b1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -8,31 +8,36 @@
 Composable Kernel User Guide
 ********************************************************************
 
-The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++. This document contains instructions for installing, using, and contributing to the Composable Kernel project. To learn more see :ref:`what-is-ck`.
+The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_.
 
-The CK documentation is structured as follows:
+The Composable Kernel repository is located at `https://github.com/ROCm/composable_kernel <https://github.com/ROCm/composable_kernel>`_.
 
 .. grid:: 2
   :gutter: 3
 
-  .. grid-item-card:: Installation
+  .. grid-item-card:: Install
 
-    * :ref:`docker-hub`
+    * :doc:`Composable Kernel prerequisites <./install/Composable-Kernel-prerequisites>`
+    * :doc:`Build and install Composable Kernel <./install/Composable-Kernel-install>`
+    * :doc:`Build and install Composable Kernel on a Docker image <./install/Composable-Kernel-Docker>`
 
   .. grid-item-card:: Conceptual
 
-    * :ref:`what-is-ck`
+    * :doc:`Composable Kernel structure <./conceptual/Composable-Kernel-structure>`
+    * :doc:`Composable Kernel mathematical basis <./conceptual/Composable-Kernel-math>`
 
-  .. grid-item-card:: API reference
+  .. grid-item-card:: Tutorials
 
-    * :ref:`supported-primitives`
+    * :doc:`Composable Kernel examples and tests <./tutorial/Composable-Kernel-examples>`
+
+  .. grid-item-card:: Reference
+
+    * :doc:`Composable Kernel supported scalar types <./reference/Composable_Kernel_supported_scalar_types>`
+    * :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>`
+    * :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>`
     * :ref:`api-reference`
     * :ref:`wrapper`
 
-  .. grid-item-card:: Tutorial
-
-    * :ref:`hello-world`
-
 To contribute to the documentation refer to `Contributing to ROCm  <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
 
 You can find licensing information on the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
diff --git a/docs/install/Composable-Kernel-Docker.rst b/docs/install/Composable-Kernel-Docker.rst
new file mode 100644
index 0000000000..d40cc2bff5
--- /dev/null
+++ b/docs/install/Composable-Kernel-Docker.rst
@@ -0,0 +1,16 @@
+.. meta::
+  :description: Composable Kernel docker files
+  :keywords: composable kernel, CK, ROCm, API, docker
+
+.. _docker-hub:
+
+********************************************************************
+Composable Kernel Docker containers
+********************************************************************
+
+Docker images that include all the required prerequisites for building Composable Kernel are available on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_.
+
+The images also contain `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_, `CMake <https://cmake.org/getting-started/>`_, and the `ROCm LLVM compiler infrastructure <https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html>`_.
+
+Composable Kernel Docker images are named according to their operating system and ROCm version. For example, a Docker image named ``ck_ub22.04_rocm6.3`` would correspond to an Ubuntu 22.04 image with ROCm 6.3. 
+
diff --git a/docs/install/Composable-Kernel-install.rst b/docs/install/Composable-Kernel-install.rst
new file mode 100644
index 0000000000..61b1fe0fcb
--- /dev/null
+++ b/docs/install/Composable-Kernel-install.rst
@@ -0,0 +1,72 @@
+.. meta::
+  :description: Composable Kernel build and install
+  :keywords: composable kernel, CK, ROCm, API, documentation, install
+
+******************************************************
+Building and installing Composable Kernel with CMake
+******************************************************
+
+Before you begin, clone the `Composable Kernel GitHub repository <https://github.com/ROCm/composable_kernel.git>`_ and create a ``build`` directory in its root:
+
+.. code:: shell
+
+  git clone https://github.com/ROCm/composable_kernel.git
+  cd composable_kernel
+  mkdir build
+
+Change directory to the ``build`` directory and generate the makefile using the ``cmake`` command. Two build options are required:
+
+* ``CMAKE_PREFIX_PATH``: The ROCm installation path. ROCm is installed in ``/opt/rocm`` by default.
+* ``CMAKE_CXX_COMPILER``: The path to the Clang compiler. Clang is found at ``/opt/rocm/llvm/bin/clang++`` by default.
+
+
+.. code:: shell
+
+  cd build
+  cmake ../. -D CMAKE_PREFIX_PATH="/opt/rocm" -D CMAKE_CXX_COMPILER="/opt/rocm/llvm/bin/clang++" [-D<OPTION1=VALUE1> [-D<OPTION2=VALUE2>] ...]
+
+
+Other build options are:
+
+* ``DISABLE_DL_KERNELS``: Set this to "ON" to not build deep learning (DL) and data parallel primitive (DPP) instances. 
+
+  .. note::
+
+      DL and DPP instances are useful on architectures that don't support XDL or WMMA.
+
+* ``CK_USE_FP8_ON_UNSUPPORTED_ARCH``: Set to ``ON`` to build FP8 data type instances on gfx90a without native FP8 support.
+* ``GPU_TARGETS``: Target architectures. Target architectures in this list must all be different versions of the same architectures. Enclose the list of targets in quotation marks. Separate multiple targets with semicolons (``;``). For example, ``cmake -D GPU_TARGETS="gfx908;gfx90a"``. This option is required to build tests and examples.
+* ``GPU_ARCHS``: Target architectures. Target architectures in this list are not limited to different versions of the same architectures. Enclose the list of targets in quotation marks. Separate multiple targets with semicolons (``;``). For example, ``cmake -D GPU_TARGETS="gfx908;gfx1100"``.
+* ``CMAKE_BUILD_TYPE``: The build type. Can be ``None``, ``Release``, ``Debug``, ``RelWithDebInfo``, or ``MinSizeRel``. CMake will use ``Release`` by default.
+
+.. Note::
+
+  If neither ``GPU_TARGETS`` nor ``GPU_ARCHS`` is specified, Composable Kernel will be built for all targets supported by the compiler.
+
+Build Composable Kernel using the generated makefile. This will build the library, the examples, and the tests, and save them to ``bin``.
+
+.. code:: shell
+
+    make -j20
+
+The ``-j`` option speeds up the build by using multiple threads in parallel. For example, ``-j20`` uses twenty threads in parallel. On average, each thread will use 2GB of memory. Make sure that the number of threads you use doesn't exceed the available memory in your system.
+
+Using ``-j`` alone will launch an unlimited number of threads and is not recommended.
+
+Install the Composable Kernel library:
+
+.. code:: shell
+  
+  make install
+
+After running ``make install``, the Composable Kernel files will be saved to the following locations:
+
+* Library files: ``/opt/rocm/lib/``
+* Header files: ``/opt/rocm/include/ck/`` and ``/opt/rocm/include/ck_tile/``
+* Examples, tests, and ckProfiler: ``/opt/rocm/bin/``
+
+For information about ckProfiler, see `the ckProfiler readme file <https://github.com/ROCm/composable_kernel/blob/develop/profiler/README.md>`_.
+
+For information about running the examples and tests, see :doc:`Composable Kernel examples and tests <../tutorial/Composable-Kernel-examples>`.
+
+
diff --git a/docs/install/Composable-Kernel-prerequisites.rst b/docs/install/Composable-Kernel-prerequisites.rst
new file mode 100644
index 0000000000..10be849ea6
--- /dev/null
+++ b/docs/install/Composable-Kernel-prerequisites.rst
@@ -0,0 +1,32 @@
+.. meta::
+  :description: Composable Kernel prerequisites
+  :keywords: composable kernel, CK, ROCm, API, documentation, prerequisites
+
+******************************************************
+Composable Kernel prerequisites
+******************************************************
+
+Docker images that include all the required prerequisites for building Composable Kernel are available on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_.
+
+The following prerequisites are required to build and install Composable Kernel:
+
+* cmake    
+* hip-rocclr    
+* iputils-ping
+* jq 
+* libelf-dev
+* libncurses5-dev  
+* libnuma-dev  
+* libpthread-stubs0-dev  
+* llvm-amdgpu  
+* mpich  
+* net-tools  
+* python3  
+* python3-dev  
+* python3-pip  
+* redis  
+* rocm-llvm-dev  
+* zlib1g-dev  
+* libzstd-dev  
+* openssh-server  
+* clang-format-12  
diff --git a/docs/install/dockerhub.rst b/docs/install/dockerhub.rst
deleted file mode 100644
index 87eb5a4f81..0000000000
--- a/docs/install/dockerhub.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-.. meta::
-  :description: Composable Kernel documentation and API reference library
-  :keywords: composable kernel, CK, ROCm, API, documentation
-
-.. _docker-hub:
-
-********************************************************************
-CK Docker Hub
-********************************************************************
-
-Why do I need this?
-===================
-
-To make things simpler, and bring Composable Kernel and its dependencies together, 
-docker images can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_. Docker images provide a complete image of the OS, the Composable Kernel library, and its dependencies in a single downloadable file. 
-
-Refer to `Docker Overview <https://docs.docker.com/get-started/overview/>`_ for more information on Docker images and containers.
-
-Which image is right for me?
-============================
-
-The image naming includes information related to the docker image. 
-For example ``ck_ub20.04_rocm6.0`` indicates the following:
-
-* ``ck`` - made for running Composable Kernel;
-* ``ub20.04`` - based on Ubuntu 20.04;
-* ``rocm6.0`` - ROCm platform version 6.0.
-
-Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. Use the ``docker pull`` command to download the file::
-
-    docker pull rocm/composable_kernel:ck_ub20.04_rocm6.0
-
-
-What is inside the image?
--------------------------
-
-The docker images have everything you need for running CK including:
-
-* `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_
-* `CMake <https://cmake.org/getting-started/>`_
-* `Compiler <https://github.com/ROCm/llvm-project>`_
-* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
-
-Running the docker container
-============================
-
-After downloading the docker image, you can start the container using one of a number of commands. Start with the ``docker run`` command as shown below::
-
-    docker run                                                            \
-    -it                                                                   \
-    --privileged                                                          \
-    --group-add sudo                                                      \
-    -w /root/workspace                                                    \
-    -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                         \
-    rocm/composable_kernel:ck_ub20.04_rocm6.0                             \
-    /bin/bash
-
-After starting the bash shell, the docker container current folder is `~/workspace`. The library path is ``~/workspace/composable_kernel``. Navigate to the library to begin the tutorial as explained in :ref:`hello-world`:
-
-.. note::
-
-    If your current folder is different from `${HOME}`, adjust the line ``-v ${HOME}:/root/workspace`` in the ``docker run`` command to fit your folder structure.
-
-Stop and restart the docker image
-=================================
-
-After finishing the tutorial, or just when you have completed your work session, you can close the docker container, or stop the docker container to restart it at another time. Closing the docker container means that it is still in the active state, and can be resumed from where you left it. Stopping the container closes it, and returns the image to its initial state. 
-
-Use the ``Ctrl-D`` option to exit the container, while leaving it active, so you can return to the container in its current state to resume the tutorial, or pickup your project where you left off. 
-
-To restart the active container use the ``docker exec`` command to specify the container name and options as follows::
-
-    docker exec -it <container_name> bash
-
-Where: 
-
-* `exec` is the docker command
-* `-it` is the interactive option for `exec`
-* `<container_name>` specifies an active container on the system
-* `bash` specifies the command to run in the interactive shell
-
-.. note::
-
-    You can use the ``docker container ls`` command to list the active containers on the system.
-
-To start a container from the image, use the ``docker start`` command::
-
-    docker start <container_name>
-
-Then use the docker exec command as shown above to start the bash shell. 
-
-Use the ``docker stop`` command to stop the container and restore the image to its initial state::
-
-    docker stop <container_name>
-    
-Editing the docker image
-=======================
-
-If you want to customize the docker image, edit the
-`Dockerfile <https://github.com/ROCm/composable_kernel/blob/develop/Dockerfile>`_
-from the GitHub repository to suit your needs.
diff --git a/docs/reference/API_Reference_Guide.rst b/docs/reference/Composable-Kernel-API-reference.rst
similarity index 79%
rename from docs/reference/API_Reference_Guide.rst
rename to docs/reference/Composable-Kernel-API-reference.rst
index 0d2d41c1eb..b6ee9f7790 100644
--- a/docs/reference/API_Reference_Guide.rst
+++ b/docs/reference/Composable-Kernel-API-reference.rst
@@ -5,26 +5,20 @@
 .. _api-reference:
 
 ********************************************************************
-API reference guide
+Composable Kernel API reference guide
 ********************************************************************
 
-
-This document contains details of the APIs for the Composable Kernel (CK) library and introduces
-some of the key design principles that are used to write new classes that extend CK functionality.
+This document contains details of the APIs for the Composable Kernel library and introduces some of the key design principles that are used to write new classes that extend the functionality of the Composable Kernel library.
 
 =================
-CK Datatypes
-=================
-
------------------
 DeviceMem
------------------
+=================
 
 .. doxygenstruct:: DeviceMem
 
----------------------------
+=============================
 Kernels For Flashattention
----------------------------
+=============================
 
 The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
 the classes that are used in the CK GPU implementation of Flashattention.
diff --git a/docs/reference/wrapper.rst b/docs/reference/Composable-Kernel-wrapper.rst
similarity index 88%
rename from docs/reference/wrapper.rst
rename to docs/reference/Composable-Kernel-wrapper.rst
index 190fbcd445..4baa8d2b64 100644
--- a/docs/reference/wrapper.rst
+++ b/docs/reference/Composable-Kernel-wrapper.rst
@@ -1,20 +1,15 @@
 .. meta::
-  :description: Composable Kernel documentation and API reference library
-  :keywords: composable kernel, CK, ROCm, API, documentation
+  :description: Composable Kernel wrapper
+  :keywords: composable kernel, CK, ROCm, API, wrapper
 
 .. _wrapper:
 
 ********************************************************************
-Wrapper
+Composable Kernel wrapper
 ********************************************************************
 
--------------------------------------
-Description
--------------------------------------
 
-
-The CK library provides a lightweight wrapper for more complex operations implemented in 
-the library.
+The Composable Kernel library provides a lightweight wrapper to simplify the more complex operations.
 
 Example:
 
diff --git a/docs/reference/Composable_Kernel_custom_types.rst b/docs/reference/Composable_Kernel_custom_types.rst
new file mode 100644
index 0000000000..863d4131b9
--- /dev/null
+++ b/docs/reference/Composable_Kernel_custom_types.rst
@@ -0,0 +1,39 @@
+.. meta::
+  :description: Composable Kernel supported custom types
+  :keywords: composable kernel, custom, data types, support, CK, ROCm
+
+******************************************************
+Composable Kernel custom data types
+******************************************************
+
+Composable Kernel supports the use of custom types that provide a way to implement specialized numerical formats.
+
+To use custom types, a C++ type that implements the necessary operations for tensor computations needs to be created. These should include:
+
+* Constructors and initialization methods
+* Arithmetic operators if the type will be used in computational operations
+* Any conversion functions needed to interface with other parts of an application
+
+For example, to create a complex half-precision type:
+
+.. code:: cpp
+
+    struct complex_half_t
+    {
+        half_t real;
+        half_t img;
+    };
+
+    struct complex_half_t
+    {
+        using type = half_t;
+        type real;
+        type img;
+
+        complex_half_t() : real{type{}}, img{type{}} {}
+        complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {}
+    };
+
+Custom types can be particularly useful for specialized applications such as complex number arithmetic,
+custom quantization schemes, or domain-specific number representations.
+
diff --git a/docs/reference/Composable_Kernel_supported_scalar_types.rst b/docs/reference/Composable_Kernel_supported_scalar_types.rst
new file mode 100644
index 0000000000..7ea1a9eaeb
--- /dev/null
+++ b/docs/reference/Composable_Kernel_supported_scalar_types.rst
@@ -0,0 +1,69 @@
+.. meta::
+  :description: Composable Kernel supported scalar types
+  :keywords: composable kernel, scalar, data types, support, CK, ROCm
+
+***************************************************
+Composable Kernel supported scalar data types
+***************************************************
+
+The Composable Kernel library provides support for the following scalar data types:
+
+.. list-table::
+    :header-rows: 1
+    :widths: 25 15 60
+
+    * - Type
+      - Bit Width
+      - Description
+
+    * - ``double``
+      - 64-bit
+      - Standard IEEE 754 double precision floating point
+
+    * - ``float``
+      - 32-bit
+      - Standard IEEE 754 single precision floating point
+
+    * - ``int32_t``
+      - 32-bit
+      - Standard signed 32-bit integer
+
+    * - ``int8_t``
+      - 8-bit
+      - Standard signed 8-bit integer
+
+    * - ``uint8_t``
+      - 8-bit
+      - Standard unsigned 8-bit integer
+
+    * - ``bool``
+      - 1-bit
+      - Boolean type
+
+    * - ``ck::half_t``
+      - 16-bit
+      - IEEE 754 half precision floating point with 5 exponent bits, 10 mantissa bits, and 1 sign bit
+
+    * - ``ck::bhalf_t``
+      - 16-bit
+      - Brain floating point with 8 exponent bits, 7 mantissa bits, and 1 sign bit
+
+    * - ``ck::f8_t``
+      - 8-bit
+      - 8-bit floating point (E4M3 format) with 4 exponent bits, 3 mantissa bits, and 1 sign bit
+
+    * - ``ck::bf8_t``
+      - 8-bit
+      - 8-bit brain floating point (E5M2 format) with 5 exponent bits, 2 mantissa bits, and 1 sign bit
+
+    * - ``ck::f4_t``
+      - 4-bit
+      - 4-bit floating point format (E2M1 format) with 2 exponent bits, 1 mantissa bit, and 1 sign bit
+
+    * - ``ck::f6_t``
+      - 6-bit
+      - 6-bit floating point format (E2M3 format) with 2 exponent bits, 3 mantissa bits, and 1 sign bit
+
+    * - ``ck::bf6_t``
+      - 6-bit
+      - 6-bit brain floating point format (E3M2 format) with 3 exponent bits, 2 mantissa bits, and 1 sign bit
\ No newline at end of file
diff --git a/docs/reference/Composable_Kernel_vector_utilities.rst b/docs/reference/Composable_Kernel_vector_utilities.rst
new file mode 100644
index 0000000000..3103653191
--- /dev/null
+++ b/docs/reference/Composable_Kernel_vector_utilities.rst
@@ -0,0 +1,16 @@
+.. meta::
+  :description: Composable Kernel supported precision types and custom type support
+  :keywords: composable kernel, precision, data types, ROCm
+
+******************************************************
+Composable Kernel vector template utilities
+******************************************************
+
+Composable Kernel includes template utilities for creating vector types with customizable widths. These template utilities also flatten nested vector types into a single, wider vector, preventing the creation of vectors of vectors.
+
+Vectors composed of supported scalar and custom types can be created with the ``ck::vector_type`` template.
+
+For example, ``ck::vector_type<float, 4>`` creates a vector composed of four floats and ``ck::vector_type<ck::half_t, 8>`` creates a vector composed of eight half-precision scalars.
+
+For vector operations to be valid, the underlying types must be either a :doc:`supported scalar type <Composable_Kernel_supported_scalar_types>` or :doc:`a custom type <Composable_Kernel_custom_types>` that implements the required operations.
+
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 533b81cd39..df98998224 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -3,34 +3,43 @@ defaults:
 root: index
 subtrees:
 
-- caption: Conceptual
-  entries:
-  - file: conceptual/what-is-ck.rst
-    title: What is Composable Kernel?
-
 - caption: Install
   entries:
-  - file: install/dockerhub.rst
-    title: Docker Hub
+  - file: install/Composable-Kernel-prerequisites.rst
+    title: Composable Kernel prerequisites
+  - file: install/Composable-Kernel-install.rst
+    title: Build and install Composable Kernel
+  - file: install/Composable-Kernel-Docker.rst
+    title: Composable Kernel Docker images
 
-- caption: CK API Reference
+- caption: Conceptual
   entries:
-  - file: reference/Supported_Primitives_Guide.rst
-    title: Supported Primitives
-  - file: reference/API_Reference_Guide.rst
-    title: API Reference
-  - file: reference/wrapper.rst
-    title: Wrapper
+  - file: conceptual/Composable-Kernel-structure.rst
+    title: Composable Kernel structure
+  - file: conceptual/Composable-Kernel-math.rst
+    title: Composable Kernel mathematical basis
 
 - caption: Tutorial
   entries:
-  - file: tutorial/tutorial_hello_world.rst
-    title: Hello World Tutorial
+  - file: tutorial/Composable-Kernel-examples.rst
+    title: Composable Kernel examples
+
+- caption: Reference
+  entries:
+  - file: reference/Composable_Kernel_supported_scalar_types.rst
+    title: Composable Kernel scalar types
+  - file: reference/Composable_Kernel_custom_types.rst
+    title: Composable Kernel custom types
+  - file: reference/Composable_Kernel_vector_utilities.rst
+    title: Composable Kernel vector utilities
+  - file: reference/Composable-Kernel-API-reference.rst
+    title: Composable Kernel API reference
+  - file: reference/Composable-Kernel-wrapper.rst
+    title: Composable Kernel Wrapper
 
 - caption: About
   entries:
   - file: Contributors_Guide.rst
-    title: Contributing to CK
+    title: Contributing to Composable Kernel
   - file: license.rst
     title: License
-    
\ No newline at end of file
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index d61b5e2b27..ac03e40939 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.17.0
+rocm-docs-core[api_reference]==1.18.2
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 177f3ec184..3742eeebba 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -6,68 +6,79 @@
 #
 accessible-pygments==0.0.5
     # via pydata-sphinx-theme
-alabaster==0.7.16
+alabaster==1.0.0
     # via sphinx
 asttokens==3.0.0
     # via stack-data
-attrs==24.3.0
+attrs==25.3.0
     # via
     #   jsonschema
     #   jupyter-cache
     #   referencing
-babel==2.15.0
+babel==2.17.0
     # via
     #   pydata-sphinx-theme
     #   sphinx
-beautifulsoup4==4.12.3
+beautifulsoup4==4.13.4
     # via pydata-sphinx-theme
-breathe==4.35.0
+breathe==4.36.0
     # via rocm-docs-core
-certifi==2024.7.4
+certifi==2025.1.31
     # via requests
-cffi==1.16.0
+cffi==1.17.1
     # via
     #   cryptography
     #   pynacl
-charset-normalizer==3.3.2
+charset-normalizer==3.4.1
     # via requests
-click==8.1.7
+click==8.1.8
     # via
+    #   click-log
+    #   doxysphinx
     #   jupyter-cache
     #   sphinx-external-toc
+click-log==0.4.0
+    # via doxysphinx
 comm==0.2.2
     # via ipykernel
-cryptography==43.0.0
+contourpy==1.3.2
+    # via matplotlib
+cryptography==44.0.2
     # via pyjwt
-debugpy==1.8.12
+cycler==0.12.1
+    # via matplotlib
+debugpy==1.8.14
     # via ipykernel
-decorator==5.1.1
+decorator==5.2.1
     # via ipython
-deprecated==1.2.14
+deprecated==1.2.18
     # via pygithub
 docutils==0.21.2
     # via
-    #   breathe
     #   myst-parser
     #   pybtex-docutils
     #   pydata-sphinx-theme
     #   sphinx
     #   sphinxcontrib-bibtex
+doxysphinx==3.3.12
+    # via rocm-docs-core
 exceptiongroup==1.2.2
     # via ipython
-executing==2.1.0
+executing==2.2.0
     # via stack-data
-fastjsonschema==2.20.0
+fastjsonschema==2.21.1
     # via
     #   nbformat
     #   rocm-docs-core
-gitdb==4.0.11
+fonttools==4.57.0
+    # via matplotlib
+gitdb==4.0.12
     # via gitpython
-gitpython==3.1.43
+gitpython==3.1.44
     # via rocm-docs-core
-greenlet==3.1.1
+greenlet==3.2.1
     # via sqlalchemy
-idna==3.7
+idna==3.10
     # via requests
 imagesize==1.4.1
     # via sphinx
@@ -77,13 +88,13 @@ importlib-metadata==8.6.1
     #   myst-nb
 ipykernel==6.29.5
     # via myst-nb
-ipython==8.31.0
+ipython==8.35.0
     # via
     #   ipykernel
     #   myst-nb
 jedi==0.19.2
     # via ipython
-jinja2==3.1.4
+jinja2==3.1.6
     # via
     #   myst-parser
     #   sphinx
@@ -103,25 +114,35 @@ jupyter-core==5.7.2
     #   jupyter-client
     #   nbclient
     #   nbformat
+kiwisolver==1.4.8
+    # via matplotlib
 latexcodec==3.0.0
     # via pybtex
+libsass==0.22.0
+    # via doxysphinx
+lxml==5.2.1
+    # via doxysphinx
 markdown-it-py==3.0.0
     # via
     #   mdit-py-plugins
     #   myst-parser
-markupsafe==2.1.5
+markupsafe==3.0.2
     # via jinja2
+matplotlib==3.10.1
+    # via doxysphinx
 matplotlib-inline==0.1.7
     # via
     #   ipykernel
     #   ipython
-mdit-py-plugins==0.4.1
+mdit-py-plugins==0.4.2
     # via myst-parser
 mdurl==0.1.2
     # via markdown-it-py
-myst-nb==1.1.2
+mpire==2.10.2
+    # via doxysphinx
+myst-nb==1.2.0
     # via rocm-docs-core
-myst-parser==3.0.1
+myst-parser==4.0.1
     # via myst-nb
 nbclient==0.10.2
     # via
@@ -134,20 +155,28 @@ nbformat==5.10.4
     #   nbclient
 nest-asyncio==1.6.0
     # via ipykernel
-packaging==24.1
+numpy==1.26.4
+    # via
+    #   contourpy
+    #   doxysphinx
+    #   matplotlib
+packaging==25.0
     # via
     #   ipykernel
+    #   matplotlib
     #   pydata-sphinx-theme
     #   sphinx
 parso==0.8.4
     # via jedi
 pexpect==4.9.0
     # via ipython
-platformdirs==4.3.6
+pillow==11.2.1
+    # via matplotlib
+platformdirs==4.3.7
     # via jupyter-core
-prompt-toolkit==3.0.50
+prompt-toolkit==3.0.51
     # via ipython
-psutil==6.1.1
+psutil==7.0.0
     # via ipykernel
 ptyprocess==0.7.0
     # via pexpect
@@ -165,21 +194,30 @@ pydata-sphinx-theme==0.15.4
     # via
     #   rocm-docs-core
     #   sphinx-book-theme
-pygithub==2.3.0
+pygithub==2.6.1
     # via rocm-docs-core
-pygments==2.18.0
+pygments==2.19.1
     # via
     #   accessible-pygments
     #   ipython
+    #   mpire
     #   pydata-sphinx-theme
     #   sphinx
-pyjwt[crypto]==2.8.0
+pyjson5==1.6.8
+    # via doxysphinx
+pyjwt[crypto]==2.10.1
     # via pygithub
 pynacl==1.5.0
     # via pygithub
+pyparsing==3.2.3
+    # via
+    #   doxysphinx
+    #   matplotlib
 python-dateutil==2.9.0.post0
-    # via jupyter-client
-pyyaml==6.0.1
+    # via
+    #   jupyter-client
+    #   matplotlib
+pyyaml==6.0.2
     # via
     #   jupyter-cache
     #   myst-nb
@@ -187,11 +225,11 @@ pyyaml==6.0.1
     #   pybtex
     #   rocm-docs-core
     #   sphinx-external-toc
-pyzmq==26.2.0
+pyzmq==26.4.0
     # via
     #   ipykernel
     #   jupyter-client
-referencing==0.36.1
+referencing==0.36.2
     # via
     #   jsonschema
     #   jsonschema-specifications
@@ -199,23 +237,23 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.17.0
+rocm-docs-core[api-reference]==1.18.2
     # via -r requirements.in
-rpds-py==0.22.3
+rpds-py==0.24.0
     # via
     #   jsonschema
     #   referencing
-six==1.16.0
+six==1.17.0
     # via
     #   pybtex
     #   python-dateutil
-smmap==5.0.1
+smmap==5.0.2
     # via gitdb
 snowballstemmer==2.2.0
     # via sphinx
-soupsieve==2.5
+soupsieve==2.7
     # via beautifulsoup4
-sphinx==7.4.7
+sphinx==8.1.3
     # via
     #   breathe
     #   myst-nb
@@ -228,15 +266,15 @@ sphinx==7.4.7
     #   sphinx-external-toc
     #   sphinx-notfound-page
     #   sphinxcontrib-bibtex
-sphinx-book-theme==1.1.3
+sphinx-book-theme==1.1.4
     # via rocm-docs-core
 sphinx-copybutton==0.5.2
     # via rocm-docs-core
-sphinx-design==0.6.0
+sphinx-design==0.6.1
     # via rocm-docs-core
 sphinx-external-toc==1.0.1
     # via rocm-docs-core
-sphinx-notfound-page==1.0.3
+sphinx-notfound-page==1.1.0
     # via rocm-docs-core
 sphinxcontrib-applehelp==2.0.0
     # via sphinx
@@ -252,18 +290,20 @@ sphinxcontrib-qthelp==2.0.0
     # via sphinx
 sphinxcontrib-serializinghtml==2.0.0
     # via sphinx
-sqlalchemy==2.0.37
+sqlalchemy==2.0.40
     # via jupyter-cache
 stack-data==0.6.3
     # via ipython
 tabulate==0.9.0
     # via jupyter-cache
-tomli==2.0.1
+tomli==2.2.1
     # via sphinx
 tornado==6.4.2
     # via
     #   ipykernel
     #   jupyter-client
+tqdm==4.67.1
+    # via mpire
 traitlets==5.14.3
     # via
     #   comm
@@ -274,21 +314,22 @@ traitlets==5.14.3
     #   matplotlib-inline
     #   nbclient
     #   nbformat
-typing-extensions==4.12.2
+typing-extensions==4.13.2
     # via
+    #   beautifulsoup4
     #   ipython
     #   myst-nb
     #   pydata-sphinx-theme
     #   pygithub
     #   referencing
     #   sqlalchemy
-urllib3==2.2.2
+urllib3==2.4.0
     # via
     #   pygithub
     #   requests
 wcwidth==0.2.13
     # via prompt-toolkit
-wrapt==1.16.0
+wrapt==1.17.2
     # via deprecated
 zipp==3.21.0
     # via importlib-metadata
diff --git a/docs/tutorial/Composable-Kernel-examples.rst b/docs/tutorial/Composable-Kernel-examples.rst
new file mode 100644
index 0000000000..62422d6f15
--- /dev/null
+++ b/docs/tutorial/Composable-Kernel-examples.rst
@@ -0,0 +1,40 @@
+.. meta::
+  :description: Composable Kernel examples and tests
+  :keywords: composable kernel, CK, ROCm, API, examples, tests
+
+********************************************************************
+Composable Kernel examples and tests
+********************************************************************
+
+After :doc:`building and installing Composable Kernel <../install/Composable-Kernel-install>`, the examples and tests will be moved to ``/opt/rocm/bin/``.
+
+All tests have the prefix ``test`` and all examples have the prefix ``example``.
+
+Use ``ctest`` with no arguments to run all examples and tests, or use ``ctest -R`` to run a single test. For example:
+
+.. code:: shell
+
+    ctest -R test_gemm_fp16
+
+Examples can be run individually as well. For example:
+
+.. code:: shell
+
+    ./bin/example_gemm_xdl_fp16 1 1 1
+
+For instructions on how to run individual examples and tests, see their README files in the |example|_ and |test|_ GitHub folders.
+
+To run smoke tests, use ``make smoke``.
+
+To run regression tests, use ``make regression``.
+
+In general, tests that run for under thirty seconds are included in the smoke tests and tests that run for over thirty seconds are included in the regression tests. 
+
+.. |example| replace:: ``example``
+.. _example: https://github.com/ROCm/composable_kernel/tree/develop/example
+
+.. |client_example| replace:: ``client_example``
+.. _client_example: https://github.com/ROCm/composable_kernel/tree/develop/client_example
+
+.. |test| replace:: ``test``
+.. _test: https://github.com/ROCm/composable_kernel/tree/develop/test
\ No newline at end of file
diff --git a/docs/tutorial/tutorial_hello_world.rst b/docs/tutorial/tutorial_hello_world.rst
deleted file mode 100644
index c31460785b..0000000000
--- a/docs/tutorial/tutorial_hello_world.rst
+++ /dev/null
@@ -1,165 +0,0 @@
-.. meta::
-  :description: Composable Kernel documentation and API reference library
-  :keywords: composable kernel, CK, ROCm, API, documentation
-
-.. _hello-world:
-
-********************************************************************
-Hello World Tutorial
-********************************************************************
-
-This tutorial is for engineers dealing with artificial intelligence and machine learning who
-would like to optimize pipelines and improve performance using the Composable
-Kernel (CK) library. This tutorial provides an introduction to the CK library. You will build the library and run some examples using a "Hello World" example. 
-
-Description
-===========
-
-Modern AI technology solves more and more problems in a variety of fields, but crafting fast and
-efficient workflows is still challenging. CK can make the AI workflow fast
-and efficient. CK is a collection of optimized AI operator kernels with tools to create
-new kernels. The library has components required for modern neural network architectures
-including matrix multiplication, convolution, contraction, reduction, attention modules, a variety of activation functions, and fused operators.
-
-CK library acceleration features are based on:
-
-* Layered structure
-* Tile-based computation model
-* Tensor coordinate transformation
-* Hardware acceleration use
-* Support of low precision data types including fp16, bf16, int8 and int4
-
-If you need more technical details and benchmarking results read the following 
-`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
-
-To download the library visit the `composable_kernel repository <https://github.com/ROCm/composable_kernel>`_.
-
-Hardware targets
-================
-
-CK library fully supports `gfx908` and `gfx90a` GPU architectures, while only some operators are
-supported for `gfx1030` devices. Check your hardware to determine the target GPU architecture.
-
-==========     =========
-GPU Target     AMD GPU
-==========     =========
-gfx908 	       Radeon Instinct MI100
-gfx90a 	       Radeon Instinct MI210, MI250, MI250X
-gfx1030        Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
-==========     =========
-
-There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
-you don't have an AMD GPU at hand.
-
-Build the library
-=================
-
-This tutorial is based on the use of docker images as explained in :ref:`docker-hub`. Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. 
-
-.. note::
-
-   You can also `install ROCm <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/>`_ on your system, clone the `Composable Kernel repository <https://github.com/ROCm/composable_kernel.git>`_ on GitHub, and use that to build and run the examples using the commands described below.
-
-Both the docker container and GitHub repository include the Composable Kernel library. Navigate to the library::
-
-    cd composable_kernel/
-
-Create and change to a ``build`` directory::
-
-    mkdir build && cd build
-
-The previous section discussed supported GPU architecture. Once you decide which hardware targets are needed, run CMake using the ``GPU_TARGETS`` flag::
-
-    cmake  \
-    -D CMAKE_PREFIX_PATH=/opt/rocm  \
-    -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc  \
-    -D CMAKE_CXX_FLAGS="-O3"  \
-    -D CMAKE_BUILD_TYPE=Release  \
-    -D BUILD_DEV=OFF  \
-    -D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
-
-If everything goes well the CMake command will return::
-
-    -- Configuring done
-    -- Generating done
-    -- Build files have been written to: "/root/workspace/composable_kernel/build"
-
-Finally, you can build examples and tests::
-
-    make -j examples tests
-
-When complete you should see::
-
-    Scanning dependencies of target tests
-    [100%] Built target tests
-
-Run examples and tests
-======================
-
-Examples are listed as test cases as well, so you can run all examples and tests with::
-
-    ctest
-
-You can check the list of all tests by running::
-
-    ctest -N
-
-You can also run examples separately as shown in the following example execution::
-
-    ./bin/example_gemm_xdl_fp16 1 1 1
-
-The arguments ``1 1 1`` mean that you want to run this example in the mode: verify results with CPU, initialize matrices with integers, and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
-
-If you have a device based on `gfx908` or `gfx90a` architecture, and if the example runs as expected, you should see something like::
-
-    a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-    b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-    c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-    Perf: 1.08153 ms, 119.136 TFlops, 89.1972 GB/s, DeviceGemm_Xdl_CShuffle<Default, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, 8, 4, 1, 2> LoopScheduler: Interwave, PipelineVersion: v1
-
-However, running it on a `gfx1030` device should result in the following::
-
-    a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-    b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-    c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-    DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
-
-Don't worry, some operators are supported on `gfx1030` architecture, so you can run a
-separate example like::
-
-    ./bin/example_gemm_dl_fp16 1 1 1
-
-and it should return something like::
-
-    a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096}
-    b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-    c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-    arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
-    arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
-    arg.c_grid_desc_m_n_{ 3840, 4096}
-    launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
-    Warm up 1 time
-    Start running 10 times...
-    Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
-
-.. note::
-
-    A new CMake flag ``DL_KERNELS`` has been added to the latest versions of CK. If you do not see the above results when running ``example_gemm_dl_fp16``, you might need to add ``-D DL_KERNELS=ON`` to your CMake command to build the operators supported on the `gfx1030` architecture.
-
-You can also run a separate test::
-
-    ctest -R test_gemm_fp16
-
-If everything goes well you should see something like::
-
-    Start 121: test_gemm_fp16
-    1/1 Test #121: test_gemm_fp16 ...................   Passed   51.81 sec
-
-    100% tests passed, 0 tests failed out of 1
-
-Summary
-=======
-
-In this tutorial you took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. In the next tutorial you will run kernels with different configurations to find out the best one for your hardware and task.
-
-P.S.: If you are running on a cloud instance, don't forget to switch off the cloud instance. 
diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 97ac21eba5..96678d275a 100755
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -28,16 +28,29 @@ add_example_executable(example_gemm_xdl_fp16_v3 gemm_xdl_fp16_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3)
 add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3)
+
 add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp)
-add_example_executable(example_gemm_xdl_fp16_pk_i4_v3 gemm_xdl_fp16_pk_i4_v3.cpp)
-add_example_executable(example_gemm_xdl_fp16_pk_i4_v3_b_scale gemm_xdl_fp16_pk_i4_v3_b_scale.cpp)
-add_example_executable(example_gemm_xdl_bf16_pk_i4_v3 gemm_xdl_bf16_pk_i4_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
+
+
+add_example_executable(example_gemm_xdl_fp16_fp8_streamk_v3 gemm_xdl_fp16_fp8_streamk_v3.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_streamk_v3)
+
 add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3)
 
-add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
-add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)
+list(APPEND gpu_list gfx942 gfx950)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
+        add_example_executable(example_gemm_xdl_fp16_pk_i4_v3 gemm_xdl_fp16_pk_i4_v3.cpp)
+        add_example_executable(example_gemm_xdl_fp16_pk_i4_v3_b_scale gemm_xdl_fp16_pk_i4_v3_b_scale.cpp)
+        add_example_executable(example_gemm_xdl_bf16_pk_i4_v3 gemm_xdl_bf16_pk_i4_v3.cpp)
+        add_example_executable(example_gemm_xdl_fp8_pk_i4_bpreshuffle_v3 gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp)
+        add_example_executable(example_gemm_xdl_fp8_pk_i4_v3 gemm_xdl_fp8_pk_i4_v3.cpp)
+        set(target 1)
+    endif()
+endforeach()
 
 add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
@@ -61,7 +74,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
 
 add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
 
-list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list AND target EQUAL 0)
@@ -70,6 +83,12 @@ foreach(gpu IN LISTS GPU_TARGETS)
 
         add_example_executable(example_gemm_xdl_lds_direct_load_fp16 gemm_xdl_lds_direct_load_fp16.cpp)
         add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp16)
+
+        add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
+        add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)
+
+        add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp)
+        add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3)
         set(target 1)
     endif()
 endforeach()
@@ -80,9 +99,6 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
 add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
 
-add_example_executable(example_gemm_xdl_fp8_streamk_v3 gemm_xdl_fp8_streamk_v3.cpp)
-add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_streamk_v3)
-
 add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
 
diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp
index 9664c50b6e..9073ffcfc1 100644
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -7,6 +7,7 @@
 #include <iostream>
 #include <initializer_list>
 #include <numeric>
+#include <unordered_map>
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
@@ -369,3 +370,25 @@ inline __host__ __device__ constexpr double get_atol()
         return 1e-3;
     }
 }
+
+float i4_to_f32_gfx9(uint8_t i4)
+{
+    static std::unordered_map<uint8_t, float> u = {{0b1000, -0.5000f},
+                                                   {0b1001, -0.4375f},
+                                                   {0b1010, -0.3750f},
+                                                   {0b1011, -0.3125f},
+                                                   {0b1100, -0.2500f},
+                                                   {0b1101, -0.1875f},
+                                                   {0b1110, -0.1250f},
+                                                   {0b1111, -0.0625f},
+                                                   {0b0, +0.0000f},
+                                                   {0b1, +0.0625f},
+                                                   {0b10, +0.1250f},
+                                                   {0b11, +0.1875f},
+                                                   {0b100, +0.2500f},
+                                                   {0b101, +0.3125f},
+                                                   {0b110, +0.3750f},
+                                                   {0b111, +0.4375f}};
+
+    return u[i4];
+}
diff --git a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
index 7b491173a6..7178ad46b9 100644
--- a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
@@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
 
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
     // weight permute
@@ -199,6 +199,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+
+        return true;
+    }
+
     bool pass = true;
     if(config.do_verification)
     {
diff --git a/example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp
new file mode 100644
index 0000000000..bd38eb17ee
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+using ADataType        = ck::half_t;
+using BDataType        = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmV2_Streamk_Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        64,
+        16, 16, 
+        256, 8, 16,
+        16,   16,
+        1,    1, 
+        S<32, 2, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        S<16, 4, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 16, 16, 0,
+        1, 1, S<1, 16, 1, 4>, 4,
+        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
+// clang-format on
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+
+#include "run_gemm_example_streamk_v2.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
index e8a3064de6..e16f184a20 100644
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -134,7 +134,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
 
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
     // weight permute
@@ -249,6 +249,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+
+        return true;
+    }
+
     bool pass = true;
     if(config.do_verification)
     {
diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
index c8a40baa8a..f83d479713 100644
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
@@ -161,7 +161,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
 
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
     DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize());
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
@@ -281,6 +281,13 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         return true;
     }
 
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+
+        return true;
+    }
+
     bool pass = true;
     if(config.do_verification)
     {
diff --git a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
new file mode 100644
index 0000000000..266a1e9d3e
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp"
+
+using F8  = ck::f8_t;
+using I4  = ck::pk_i4_t;
+using F16 = ck::half_t;
+using F32 = float;
+
+using ADataType        = F8;
+using BDataType        = I4;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using CDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr bool PermuteA = false;
+static constexpr bool PermuteB = false;
+
+// clang-format off
+#if 0
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3_BPreshuffle<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        256,
+        128, 128,
+        256, 16, 32,
+        32,   32,
+        4,    1,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 16, 16, 0,
+        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 32, 1, 8>, 4,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, F8, F8, PermuteA, PermuteB>;
+        
+#else
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3_BPreshuffle<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        256,
+        256, 256,
+        128, 16, 32,
+        32,   32,
+        4,    4,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 16, 16, 0,
+        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, F8, F8, PermuteA, PermuteB>;
+
+#endif
+// clang-format on
+
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_preshuffled(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "b_k_n_preshuffled:" << b_k_n_preshuffled.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_preshuffled.mDesc.GetElementSpaceSize() /
+                               2);
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // do GEMM
+    auto gemm = DeviceGemmV2Instance{};
+
+    // weight pre-shuffle
+    int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
+    int NLane = gemm.GetPreShuffleParameters();
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            b_k_n_preshuffled(outputIndex) = b_k_n(n * K + k);
+        }
+    }
+
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int i4x2         = b_k_n_preshuffled(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int hi   = input[2];
+                int lo   = input[0];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_preshuffled(j + 0, i) = i4x2;
+            }
+
+            {
+                int hi   = input[6];
+                int lo   = input[4];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_preshuffled(j + 2, i) = i4x2;
+            }
+
+            {
+                int hi   = input[3];
+                int lo   = input[1];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_preshuffled(j + 4, i) = i4x2;
+            }
+
+            {
+                int hi   = input[7];
+                int lo   = input[5];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_preshuffled(j + 6, i) = i4x2;
+            }
+        }
+    }
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_preshuffled.mData.data());
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      KBatch,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+
+        return true;
+    }
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<float> b_k_n_f32({K, N});
+
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                ck::pk_i4_t i4x2 = b_k_n(k, n).data;
+                uint8_t i4       = 0;
+
+                if(k % 2 == 1)
+                    i4 = (i4x2.data >> 0) & 0xf;
+                else
+                    i4 = (i4x2.data >> 4) & 0xf;
+
+                float v_b       = i4_to_f32_gfx9(i4);
+                b_k_n_f32(k, n) = v_b;
+            }
+        }
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                float,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n_f32, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time =
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
+
+        std::size_t flop = 2_uz * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K +
+            sizeof(BDataType) * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+
+    return pass;
+}
+
+bool run_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
new file mode 100644
index 0000000000..0575314dff
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+
+using F8  = ck::f8_t;
+using I4  = ck::pk_i4_t;
+using F16 = ck::half_t;
+using F32 = float;
+
+using ADataType        = F8;
+using BDataType        = I4;
+using AccDataType      = float;
+using CShuffleDataType = F16;
+using CDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr bool PermuteA         = false;
+static constexpr bool PermuteB         = true;
+static constexpr ck::index_t KPerBlock = 128;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        256,
+        128, 128,
+        KPerBlock, 16, 32,
+        32,   32,
+        2,    2,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 16, 16, 0,
+        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 32, 32, 0,
+        1, 1, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;
+
+// clang-format on
+
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // weight permute
+    if constexpr(PermuteB)
+    {
+        int K1 = KPerBlock;
+        int K0 = K / KPerBlock;
+
+        // int K0, N, K1
+        for(int j = 0; j < K0; j++)
+        {
+            for(int i = 0; i < N; i++)
+            {
+                for(int jj = 0; jj < K1; jj++)
+                {
+                    b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                }
+            }
+        }
+    }
+    else
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j++)
+            {
+                b_k_n_permute(i * K + j) = b_k_n(i * K + j);
+            }
+        }
+    }
+
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int i4x2         = b_k_n_permute(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int hi   = input[2];
+                int lo   = input[0];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 0, i) = i4x2;
+            }
+
+            {
+                int hi   = input[6];
+                int lo   = input[4];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 2, i) = i4x2;
+            }
+
+            {
+                int hi   = input[3];
+                int lo   = input[1];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 4, i) = i4x2;
+            }
+
+            {
+                int hi   = input[7];
+                int lo   = input[5];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 6, i) = i4x2;
+            }
+        }
+    }
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm      = DeviceGemmV2Instance{};
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      KBatch,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+
+        return true;
+    }
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<float> b_k_n_f32({K, N});
+
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                ck::pk_i4_t i4x2 = b_k_n(k, n).data;
+                uint8_t i4       = 0;
+
+                if(k % 2 == 1)
+                    i4 = (i4x2.data >> 0) & 0xf;
+                else
+                    i4 = (i4x2.data >> 4) & 0xf;
+
+                float v_b       = i4_to_f32_gfx9(i4);
+                b_k_n_f32(k, n) = v_b;
+            }
+        }
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                float,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n_f32, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time =
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
+
+        std::size_t flop = 2_uz * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K +
+            sizeof(BDataType) * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+
+    return pass;
+}
+
+bool run_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
diff --git a/example/01_gemm/gemm_xdl_streamk.cpp b/example/01_gemm/gemm_xdl_streamk.cpp
old mode 100755
new mode 100644
index 01542c4775..41665a79b7
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
@@ -27,22 +27,24 @@ using DeviceGemmStreamK = ck::tensor_operation::device::DeviceGemmXdlStreamK
 // ######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise|   Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|       |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|          |          |          |            |        |        |        |            |            |            |       |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-    < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   256,   128,     4, 16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,      S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
-#else // defined(CK_USE_AMD_MFMA_GFX950)
     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,      S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 
           // < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   256,   128,     4,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         1,           1,           1,               S<1, 32, 1, 8>,              8>;
       //  < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    128,   32,   64,     4,   8,  32,   32,    1,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>;
        // < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    128,   32,   128,     4,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         1,           1,           1,               S<1, 32, 1, 4>,              8>;
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
 
+// instance for double rate mfma instruction on gfx950
+using DeviceGemmStreamK2 = ck::tensor_operation::device::DeviceGemmXdlStreamK
+// ######|     AData|     BData|     CData|     AccData| ALayout| BLayout| CLayout|           A|           B|           C|  Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// ######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise|   Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|       |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|          |          |          |            |        |        |        |            |            |            |       |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+    < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   256,   128,     4, 16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,      S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 
-
-// // clang-format on
 // clang-format on
 
-using DeviceGemmInstance = DeviceGemmStreamK;
+using DeviceGemmInstance  = DeviceGemmStreamK;
+using DeviceGemmInstance2 = DeviceGemmStreamK2;
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
@@ -58,6 +60,6 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
                                                                              BElementOp,
                                                                              CElementOp>;
 
-#include "run_gemm_example.inc"
+#include "run_gemm_example_streamk.inc"
 
 int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index 4371af6244..c064ed500c 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
-
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 {
@@ -124,23 +122,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     auto b_element_op = BElementOp{};
     auto c_element_op = CElementOp{};
 
-    using BaseStreamK = ck::tensor_operation::device::DeviceGemmStreamK<ALayout,
-                                                                        BLayout,
-                                                                        CLayout,
-                                                                        ADataType,
-                                                                        BDataType,
-                                                                        CDataType,
-                                                                        AElementOp,
-                                                                        BElementOp,
-                                                                        CElementOp>;
-
     // do GEMM
     auto gemm      = DeviceGemmInstance{};
     auto invoker   = gemm.MakeInvoker();
     float ave_time = 0;
 
-    if constexpr(std::is_same<ProblemType, ProblemSize>::value &&
-                 !std::is_base_of<BaseStreamK, DeviceGemmInstance>::value)
+    if constexpr(std::is_same<ProblemType, ProblemSize>::value)
     {
         auto argument = gemm.MakeArgument(
 #ifdef BUILD_INT4_EXAMPLE
@@ -171,61 +158,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 
         ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
     }
-    else if constexpr(std::is_same<ProblemType, ProblemSizeStreamK>::value &&
-                      std::is_base_of<BaseStreamK, DeviceGemmInstance>::value)
-    {
-        auto argument = gemm.MakeArgument(
-#ifdef BUILD_INT4_EXAMPLE
-            static_cast<KernelADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-            static_cast<KernelBDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-            static_cast<KernelCDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-#else
-            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-            static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-#endif
-            M,
-            N,
-            K,
-            StrideA,
-            StrideB,
-            StrideC,
-            a_element_op,
-            b_element_op,
-            c_element_op,
-            problem_size.NumSKBlocks);
-
-        if(!gemm.IsSupportedArgument(argument))
-        {
-            std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-
-            return true;
-        }
-
-        std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
-        if(workspace_size != 0)
-        {
-            workspace.Realloc(workspace_size);
-            gemm.SetWorkSpacePointer(&argument, workspace.GetDeviceBuffer());
-        }
-
-        ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
-
-#if 0
-        // TODO!!!!!
-        if(workspace_size != 0){
-            float * ws_ptr = reinterpret_cast<float*>(malloc(workspace_size));
-            size_t ws_dwords = workspace_size / sizeof(float);
-            workspace.FromDevice(ws_ptr);
-
-            for(size_t i = 0; i < ws_dwords; i++) {
-                uint32_t rere = reinterpret_cast<uint32_t*>(ws_ptr)[i];
-                printf("%4lu : %f(0x%08x)\n", i, ws_ptr[i], rere);
-            }
-            free(ws_ptr);
-        }
-#endif
-    }
     else
     {
         // When the Problem Type and Problem Size does not fit.
@@ -319,11 +251,3 @@ bool run_gemm_example(int argc, char* argv[])
 
     return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
-
-bool run_gemm_streamk_example(int argc, char* argv[])
-{
-    ProblemSizeStreamK problem_size;
-    ExecutionConfig config;
-
-    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
-}
diff --git a/example/01_gemm/run_gemm_example_streamk.inc b/example/01_gemm/run_gemm_example_streamk.inc
new file mode 100644
index 0000000000..438afcf71a
--- /dev/null
+++ b/example/01_gemm/run_gemm_example_streamk.inc
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
+
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
+#endif
+
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.f)}(a_m_k);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.f)}(b_k_n);
+        break;
+    case 1:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
+        break;
+    case 2:
+        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
+        break;
+    case 3:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
+        break;
+    case 4:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
+        break;
+    case 5:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n);
+        break;
+    default:
+        ck::utils::FillUniformDistribution<ADataType>{-0.1f, 0.1f}(a_m_k);
+        ck::utils::FillUniformDistribution<BDataType>{-0.1f, 0.1f}(b_k_n);
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+#ifdef BUILD_INT4_EXAMPLE
+    DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) *
+                               c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    const Tensor<KernelADataType> a_m_k_converted(a_m_k);
+    const Tensor<KernelBDataType> b_k_n_converted(b_k_n);
+
+    a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data());
+#else
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+#endif
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    using BaseStreamK = ck::tensor_operation::device::DeviceGemmStreamK<ALayout,
+                                                                        BLayout,
+                                                                        CLayout,
+                                                                        ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CElementOp>;
+
+    // do GEMM
+    static_assert(std::is_base_of<BaseStreamK, DeviceGemmInstance>::value &&
+                  std::is_base_of<BaseStreamK, DeviceGemmInstance2>::value);
+    auto gemm           = DeviceGemmInstance{};
+    auto gemm2          = DeviceGemmInstance2{}; // instance for double rate mfma instruction
+    BaseStreamK* op_ptr = (ck::get_device_name() == "gfx950") ? static_cast<BaseStreamK*>(&gemm2)
+                                                              : static_cast<BaseStreamK*>(&gemm);
+
+    float ave_time   = 0;
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+    auto argument_ptr = op_ptr->MakeArgumentPointer(
+#ifdef BUILD_INT4_EXAMPLE
+        static_cast<KernelADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+        static_cast<KernelBDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+        static_cast<KernelCDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+#else
+        static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+        static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+        static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+#endif
+        M,
+        N,
+        K,
+        StrideA,
+        StrideB,
+        StrideC,
+        a_element_op,
+        b_element_op,
+        c_element_op,
+        problem_size.NumSKBlocks);
+
+    if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cerr << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    auto argument              = argument_ptr.get();
+    std::size_t workspace_size = op_ptr->GetWorkSpaceSize(argument);
+    if(workspace_size != 0)
+    {
+        workspace.Realloc(workspace_size);
+        op_ptr->SetWorkSpacePointer(argument, workspace.GetDeviceBuffer());
+    }
+
+    ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop = 2_uz * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op_ptr->GetTypeString() << std::endl;
+
+    bool pass = true;
+
+    if((config.do_verification == 1) || (config.do_verification == 3))
+    {
+        // CPU verification
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+        std::cout << "Running verification on CPU." << std::endl;
+        ref_invoker.Run(ref_argument);
+
+#ifdef BUILD_INT4_EXAMPLE
+        Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
+
+        c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data());
+
+        c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();
+
+        return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
+#else
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+#endif
+    }
+
+    if((config.do_verification == 2) || (config.do_verification == 3))
+    {
+        // GPU verification
+        auto ref_gemm_gpu    = ReferenceGemmInstanceGPU{};
+        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
+
+        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
+            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+
+        std::cout << "Running verification on GPU." << std::endl;
+        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
+
+        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_device_ref_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    return pass == true;
+}
+
+bool run_gemm_streamk_example(int argc, char* argv[])
+{
+    ProblemSizeStreamK problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
+}
diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
index aa9367cdcf..562936418b 100644
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -16,7 +16,7 @@ if(USE_BITINT_EXTENSION_INT4)
     add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
 endif(USE_BITINT_EXTENSION_INT4)
 
-list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index 8a295d14c4..91c072aef7 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -3,7 +3,6 @@ add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8 convnd_fwd_xdl_fp8.cpp)
-add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf8 convnd_fwd_xdl_bf8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp16_comp_fp8 convnd_fwd_xdl_fp16_comp_fp8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8_bf8 convnd_fwd_xdl_fp8_bf8.cpp)
@@ -11,3 +10,13 @@ add_example_executable(example_convnd_fwd_xdl_bf8_fp8 convnd_fwd_xdl_bf8_fp8.cpp
 add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
 add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
 add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
+
+# only build fp64 example for the following targets
+list(APPEND gpu_list gfx90a gfx942 gfx950)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
+        add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+        set(target 1)
+    endif()
+endforeach()
\ No newline at end of file
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index 64125cd1d0..7186c22233 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,6 +21,7 @@ struct ExecutionConfig final
     bool do_verification = true;
     int init_method      = 1;
     bool time_kernel     = false;
+    bool async_hargs     = false;
 };
 
 bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
@@ -173,8 +174,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 
     std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
     std::size_t kargs_size     = gemm.GetDeviceKernelArgSize(&argument);
+    std::size_t hargs_size     = gemm.GetHostKernelArgSize(&argument);
 
     DeviceMem gemm_workspace, gemm_kargs;
+    void* gemm_hargs;
 
     // The following is necessary since TwoStage kernel is using additional memory both
     // for Workspace and kernel arguments.
@@ -188,6 +191,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         gemm_workspace.Realloc(workspace_size);
         gemm.SetWorkSpacePointer(&argument, gemm_workspace.GetDeviceBuffer());
     }
+    if(config.async_hargs && hargs_size > 0)
+    {
+        hip_check_error(hipHostMalloc(&gemm_hargs, hargs_size));
+        gemm.SetHostKernelArgsPointer(&argument, gemm_hargs);
+    }
 
     if(!gemm.IsSupportedArgument(argument))
     {
@@ -196,7 +204,23 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
             "not support this GEMM problem");
     }
 
-    invoker.Run(argument, StreamConfig{nullptr, false});
+    if(!config.async_hargs)
+    {
+        invoker.Run(argument, StreamConfig{nullptr, false});
+    }
+    else
+    {
+        hipStream_t stream0 = nullptr;
+        hip_check_error(hipStreamCreate(&stream0));
+
+        hipEvent_t event0 = nullptr;
+        hip_check_error(hipEventCreate(&event0));
+
+        invoker.Run(argument, StreamConfig{nullptr, false}, stream0, event0);
+
+        hip_check_error(hipEventSynchronize(event0));
+        hip_check_error(hipStreamSynchronize(stream0));
+    }
 
     bool pass = true;
     if(config.do_verification)
@@ -264,18 +288,25 @@ bool run_grouped_gemm_example(int argc, char* argv[])
         problem_size.stride_Bs.push_back(problem_size.Ks[i]);
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
-
     if(argc == 4)
     {
         config.do_verification = std::stoi(argv[1]);
         config.init_method     = std::stoi(argv[2]);
         config.time_kernel     = std::stoi(argv[3]);
     }
+    else if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.async_hargs     = std::stoi(argv[4]);
+    }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: async hargs (0=n0, 1=yes)\n");
         exit(0);
     }
 
diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/example/18_batched_gemm_reduce/CMakeLists.txt
index 018b57f82c..03ba0a65df 100644
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
index 8c4913dbcc..3582bc5e33 100644
--- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
@@ -212,7 +212,8 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
 
     DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * b_g_k_n_permute.mDesc.GetElementSpaceSize());
+    DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * b_g_k_n_permute.mDesc.GetElementSpaceSize() /
+                                 2);
     DeviceMem b1_g_scale_device_buf(sizeof(BScaleDataType) * b1_g_k_n.mDesc.GetElementSpaceSize());
     DeviceMem c_g_m_n_device_buf(sizeof(CDataType) *
                                  c_g_m_n_device_result.mDesc.GetElementSpaceSize());
diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
index ce42a20be7..627e20e245 100644
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
@@ -32,9 +32,9 @@ using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
 template <ck::index_t NDimSpatial>
 using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;
 
-#if defined(CK_USE_AMD_MFMA_GFX950)
+// instance for double rate mfma on gfx950 (vs gfx942)
 template <ck::index_t NDimSpatial>
-using DeviceConvFwdInstance =
+using DeviceConvFwdInstance2 =
     ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
         NDimSpatial,
         InputLayout<NDimSpatial>,
@@ -55,14 +55,14 @@ using DeviceConvFwdInstance =
         1,           //
         256,         // BlockSize
         128,         // MPerBlock
-        256,         // NPerBlock
+        64,          // NPerBlock
         64,          // KPerBlock
         16,          // AK1
         16,          // BK1
         32,          // MPerXdl
         32,          // NPerXdl
         2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        1,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -81,7 +81,7 @@ using DeviceConvFwdInstance =
         1,
         S<1, 16, 1, 16>,
         4>;
-#else  // defined(CK_USE_AMD_MFMA_GFX950)
+// instance for gfx942-
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
     ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
@@ -104,14 +104,14 @@ using DeviceConvFwdInstance =
         1,           //
         256,         // BlockSize
         128,         // MPerBlock
-        256,         // NPerBlock
-        16,          // KPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
         4,           // AK1
         4,           // BK1
         32,          // MPerXdl
         32,          // NPerXdl
         2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        2,           // NXdlPerWave
         S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -130,7 +130,6 @@ using DeviceConvFwdInstance =
         1,
         S<1, 16, 1, 16>,
         4>;
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
 
 template <ck::index_t NDimSpatial>
 using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
@@ -235,40 +234,67 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
     copy(conv_param.input_right_pads_, input_right_pads);
 
     // do Conv
-    auto conv    = DeviceConvFwdInstance<NDimSpatial>{};
-    auto invoker = conv.MakeInvoker();
-    auto argument =
-        conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
-                          wei_device_buf.GetDeviceBuffer(),
-                          std::array<const void*, 2>{bias_device_buf.GetDeviceBuffer(),
-                                                     residual_device_buf.GetDeviceBuffer()},
-                          out_device_buf.GetDeviceBuffer(),
-                          a_g_n_c_wis_lengths,
-                          a_g_n_c_wis_strides,
-                          b_g_k_c_xs_lengths,
-                          b_g_k_c_xs_strides,
-                          std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
-                              {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}},
-                          std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
-                              {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}},
-                          e_g_n_k_wos_lengths,
-                          e_g_n_k_wos_strides,
-                          conv_filter_strides,
-                          conv_filter_dilations,
-                          input_left_pads,
-                          input_right_pads,
-                          InElementOp{},
-                          WeiElementOp{},
-                          OutElementOp{});
+    using BaseGroupedConvFwdMultipleABD =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+            NDimSpatial,
+            InputLayout<NDimSpatial>,
+            WeightLayout<NDimSpatial>,
+            ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
+            OutputLayout<NDimSpatial>,
+            InKernelDataType,
+            WeiKernelDataType,
+            ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+            OutKernelDataType,
+            InElementOp,
+            WeiElementOp,
+            OutElementOp,
+            InKernelDataType,  // AComputeDataType
+            InKernelDataType>; // BComputeDataType
 
-    if(!conv.IsSupportedArgument(argument))
+    static_assert(
+        std::is_base_of<BaseGroupedConvFwdMultipleABD, DeviceConvFwdInstance<NDimSpatial>>::value &&
+        std::is_base_of<BaseGroupedConvFwdMultipleABD, DeviceConvFwdInstance2<NDimSpatial>>::value);
+
+    auto conv  = DeviceConvFwdInstance<NDimSpatial>{};  // instance for gfx942-
+    auto conv2 = DeviceConvFwdInstance2<NDimSpatial>{}; // instance for double rate mfma instruction
+                                                        // on gfx950
+    BaseGroupedConvFwdMultipleABD* op_ptr =
+        (ck::get_device_name() == "gfx950") ? static_cast<BaseGroupedConvFwdMultipleABD*>(&conv2)
+                                            : static_cast<BaseGroupedConvFwdMultipleABD*>(&conv);
+    auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+    auto argument_ptr = op_ptr->MakeArgumentPointer(
+        in_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        std::array<const void*, 2>{bias_device_buf.GetDeviceBuffer(),
+                                   residual_device_buf.GetDeviceBuffer()},
+        out_device_buf.GetDeviceBuffer(),
+        a_g_n_c_wis_lengths,
+        a_g_n_c_wis_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
+            {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}},
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
+            {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}},
+        e_g_n_k_wos_lengths,
+        e_g_n_k_wos_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        InElementOp{},
+        WeiElementOp{},
+        OutElementOp{});
+
+    if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
     {
         throw std::runtime_error(
             "wrong! device_conv with the specified compilation parameters does "
             "not support this Conv problem");
     }
 
-    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+    float avg_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, config.time_kernel});
 
     std::size_t flop      = conv_param.GetFlops();
     std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
@@ -276,7 +302,7 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
     float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
     float gb_per_sec = num_btype / 1.E6 / avg_time;
     std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << conv.GetTypeString() << std::endl;
+              << op_ptr->GetTypeString() << std::endl;
 
     if(config.do_verification)
     {
diff --git a/example/62_convnd_activ/binary/CMakeLists.txt b/example/62_convnd_activ/binary/CMakeLists.txt
index 7c09177049..b9584be89c 100644
--- a/example/62_convnd_activ/binary/CMakeLists.txt
+++ b/example/62_convnd_activ/binary/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/convinvscale/CMakeLists.txt b/example/62_convnd_activ/convinvscale/CMakeLists.txt
index 6eb7fb8ece..7aae090674 100644
--- a/example/62_convnd_activ/convinvscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convinvscale/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/convscale/CMakeLists.txt b/example/62_convnd_activ/convscale/CMakeLists.txt
index a52818e21e..26f6c1b168 100644
--- a/example/62_convnd_activ/convscale/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/convscale_add/CMakeLists.txt b/example/62_convnd_activ/convscale_add/CMakeLists.txt
index f8bc13c8f7..b2e0eecb58 100644
--- a/example/62_convnd_activ/convscale_add/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_add/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
index a794d68bb6..739c855ae4 100644
--- a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/convscale_relu/CMakeLists.txt b/example/62_convnd_activ/convscale_relu/CMakeLists.txt
index a348e30a97..c3241aecf2 100644
--- a/example/62_convnd_activ/convscale_relu/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_relu/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
index 21613b1ab3..8441030945 100644
--- a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/multi_AB/CMakeLists.txt b/example/62_convnd_activ/multi_AB/CMakeLists.txt
index 1c865d4c95..149bd6f03e 100644
--- a/example/62_convnd_activ/multi_AB/CMakeLists.txt
+++ b/example/62_convnd_activ/multi_AB/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/62_convnd_activ/unary/CMakeLists.txt b/example/62_convnd_activ/unary/CMakeLists.txt
index 927b2e3341..36b4ffc9f4 100644
--- a/example/62_convnd_activ/unary/CMakeLists.txt
+++ b/example/62_convnd_activ/unary/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
+list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 2d00545515..5d2a097576 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -1,5 +1,27 @@
 add_example_executable(example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_multiply_xdl_fp8.cpp)
 add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp)
 add_example_executable(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp)
+add_example_executable(example_gemm_multiply_multiply_xdl_fp16_bpreshuffle gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp)
 add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp)
 add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp)
+add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp)
+add_example_executable(example_moe_gemm2_xdl_fp8 moe_gemm2_xdl_fp8.cpp)
+
+list(APPEND gpu_list gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
+        add_example_executable(example_moe_gemm1_xdl_pk_i4 moe_gemm1_xdl_pk_i4.cpp)
+        add_example_executable(example_moe_gemm2_xdl_pk_i4 moe_gemm2_xdl_pk_i4.cpp)
+        if(CK_hip_VERSION VERSION_LESS_EQUAL 6.3.42132)
+            set(EXAMPLE_COMPILE_OPTIONS)
+            check_cxx_compiler_flag("-mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1" HAS_MAX_ILP_SCHEDULING_STRATEGY)
+            if(HAS_MAX_ILP_SCHEDULING_STRATEGY)
+                list(APPEND EXAMPLE_COMPILE_OPTIONS -mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1)
+            endif()
+            target_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS})
+            target_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS})
+        endif()
+        set(target 1)
+    endif()
+endforeach()
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp
new file mode 100644
index 0000000000..69803c7eeb
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F16;
+using B0DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F16;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using D0Layout = Row;
+using D1Layout = Col;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+struct MultiplyMultiply
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<F16, float, float, float>(F16& e,
+                                                                            const float& c,
+                                                                            const float& d0,
+                                                                            const float& d1) const
+    {
+        const float x0_f = c * d0 * d1;
+
+        e = ck::type_convert<F16>(x0_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<BF16, float, float, float>(BF16& e,
+                                                                             const float& c,
+                                                                             const float& d0,
+                                                                             const float& d1) const
+    {
+        const float x0_f = c * d0 * d1;
+
+        e = ck::type_convert<BF16>(x0_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, int, float, float>(
+        ck::half_t& e, const int& c, const float& d0, const float& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::half_t>(x0_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::bhalf_t, int, float, float>(
+        ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::bhalf_t>(x0_f);
+    }
+};
+
+void preShuffleBuffer(const F16* src, F16* dst, int N, int K, int NXdl)
+{
+    int KPack = 16 / sizeof(F16);
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            dst[outputIndex] = src[n * K + k];
+        }
+    }
+}
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MultiplyMultiply;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
+    // clang-format off
+///######|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|         CShuffle|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+///######|         |         |         |        |       Type|       Type|       Type|      Type|        Type|         DataType| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+///######|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+///######|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
+///###### RCR
+        // kernel 1: 256->32x128x128 
+        <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,
+               32,   128,    128,
+               8,   8,
+               32,   32,
+               1,    1,
+               S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
+               S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
+               1,    1,   S<1, 16, 1, 16>, S<8, 8, 1>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, F16>;
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideD = 0;
+    ck::index_t StrideE = N;
+
+    ck::index_t KBatch = 1;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 12)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+
+        KBatch = std::stoi(argv[11]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf(
+            "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    Tensor<B0DataType> b0_preshuffled(
+        f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use laout only for size
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{0, 2});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 2});
+        break;
+    case 2:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+    }
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+
+    constexpr auto I0 = ck::Number<0>{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+
+    int NPerXdl = device_op.GetPreShuffleParameters();
+
+    preShuffleBuffer(b0_k_n.mData.data(), b0_preshuffled.mData.data(), N, K, NPerXdl);
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, NumDTensor>{I0, I0},
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 50, false, 1});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    if(do_verification)
+    {
+        invoker.Run(argument, StreamConfig{nullptr, false});
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+                                                                                B0DataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(
+                   e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
index c33fe357d8..352d373ae5 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
@@ -76,13 +76,13 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
          <Row, Col, DsLayout, ELayout,
           A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
           AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,
-          128,   128,   128,
+          144,   128,   128,
+          8,   16,
           16,   16,
-          16,   16,
-          4,    4,
+          9,    2,
+          S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
           S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-          1, 2, S<1, 32, 1, 8>, S<8, 8, 1>,
+          1, 2, S<1, 16, 1, 16>, S<8, 8, 1>,
           ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
 // clang-format on
 
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
index 9b7849a654..b54ba5ddfb 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
@@ -55,7 +55,7 @@ using CDEElementOp = PassThrough;
 
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
 
-static constexpr ck::index_t Scale_Block_M = 128;
+static constexpr ck::index_t Scale_Block_M = 1;
 static constexpr ck::index_t Scale_Block_N = 128;
 static constexpr ck::index_t Scale_Block_K = 128;
 
@@ -65,14 +65,14 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_
           A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, 
           AElementOp,  BElementOp, CDEElementOp, GemmSpec,
           256, Scale_Block_M, Scale_Block_N, Scale_Block_K,
-          128, 128,
-          128, 16, 16,
+          16, 128,
+          256, 16, 16,
           16,   16,
-          4,    4,
-          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-          1,    2,  S<1, 32, 1, 8>,  S<8, 8, 1>,
-          ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
+          1,    2,
+          S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+          S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+          1,    2,  S<1, 16, 1, 16>,  S<8>,
+          ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])
@@ -80,11 +80,12 @@ int main(int argc, char* argv[])
     bool do_verification = true;
     int init_method      = 1;
     bool time_kernel     = false;
+    bool flush_cache     = true;
 
     // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
+    ck::index_t M = 128;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
 
     ck::index_t StrideA = K;
     ck::index_t StrideB = K;
@@ -100,7 +101,7 @@ int main(int argc, char* argv[])
         init_method     = std::stoi(argv[2]);
         time_kernel     = std::stoi(argv[3]);
     }
-    else if(argc == 10)
+    else if(argc == 8)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
@@ -110,16 +111,19 @@ int main(int argc, char* argv[])
         N = std::stoi(argv[5]);
         K = std::stoi(argv[6]);
 
-        StrideA = std::stoi(argv[7]);
-        StrideB = std::stoi(argv[8]);
-        StrideE = std::stoi(argv[9]);
+        flush_cache = std::stoi(argv[7]);
+
+        StrideA = K;
+        StrideB = K;
+        StrideE = N;
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
+        printf("arg4 to 6: M, N, K\n");
+        printf("arg7: flush both I$ and L2$ (0=no, 1=yes)\n");
         exit(0);
     }
 
@@ -182,9 +186,15 @@ int main(int argc, char* argv[])
         b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
         break;
     case 4:
-        a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
-        b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
         a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 5:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
         b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
         break;
     default:
@@ -194,6 +204,16 @@ int main(int argc, char* argv[])
         b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
     }
 #endif
+#if 0
+    for(int im =0; im< (M + Scale_Block_M - 1) / Scale_Block_M; im++){
+        float row_sum = .0;
+        for(int ik =0; ik< (K + Scale_Block_K - 1) / Scale_Block_K; ik++){
+            printf("%lf ",a1_m_k(im, ik));
+            row_sum += a1_m_k(im, ik);
+        }
+        printf("sum: %lf\n", row_sum * 128);
+    }
+#endif
 
     DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
     DeviceMem a1_device_buf(sizeof(A1DataType) * a1_m_k.mDesc.GetElementSpaceSize());
@@ -239,12 +259,24 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50});
-
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
         sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
 
+    float ave_time = .0;
+
+    if(flush_cache)
+    {
+        int rotating_buf = (512 * 1024 * 1024 + num_btype - 1) / num_btype;
+
+        ave_time = invoker.Run(argument,
+                               StreamConfig{nullptr, time_kernel, 0, 50, 100, true, rotating_buf});
+    }
+    else
+    {
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 100});
+    }
+
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
     float gb_per_sec = num_btype / 1.E6 / ave_time;
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index 9a81ef5ea7..e4e6a4f1a7 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -140,14 +140,14 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
     // clang-format off
     <   Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
         AElementOp,  BElementOp, CDEElementOp, GemmSpec, 256,
-        256,   256,    128,
+        128,   128,    128,
         16,   16,
-        16,   16,
-        8,    8,
+        32,   32,
+        2,    2,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-        1,    2,   S<1, 32, 1, 8>, S<8, 8, 1>,
-        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
+        1,    1,   S<1, 32, 1, 8>, S<8, 8, 1>,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
new file mode 100644
index 0000000000..3b31460953
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F8  = ck::f8_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F8;
+using B0DataType       = F8;
+using EDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = EDataType;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using D2DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType, D2DataType>;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using ELayout  = Row;
+using D0Layout = Row;
+using D1Layout = Col;
+using D2Layout = ELayout;
+using DsLayout = ck::Tuple<D0Layout, D1Layout, D2Layout>;
+
+// for gate, a_scale, b_scale
+struct MulABScale
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float>(
+        EDataType& e, const float& c, const float& d0, const float& d1) const
+    {
+        (void)d0;
+        (void)d1;
+        e = ck::type_convert<EDataType>(c);
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, float, float>(
+        EDataType& e, const EDataType& c, const float& d0, const float& d1) const
+    {
+        (void)d0;
+        (void)d1;
+        e = ck::type_convert<EDataType>(c);
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, EDataType, EDataType>(
+        EDataType& e, const EDataType& c, const EDataType& d0, const EDataType& d1) const
+    {
+        (void)d0;
+        (void)d1;
+        e = ck::type_convert<EDataType>(c);
+    }
+};
+
+struct MulABScaleExpertWeight
+{
+    template <typename E, typename C, typename D0, typename D1, typename D2>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const;
+    // for real kernel use
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float, float>(
+        EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, float, float, float>(
+        EDataType& e, const EDataType& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    // for reference cpu
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float, float>(
+        float& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        // for reference cpu
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+};
+
+using CDEElementOp = MulABScaleExpertWeight;
+
+void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int NXdl)
+{
+    int KPack = 16 / sizeof(B0DataType);
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            dst[outputIndex] = src[n * K + k];
+        }
+    }
+}
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+
+static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t MXDLPerWave = 4;
+static constexpr ck::index_t NXDLPerWave = 2;
+static constexpr ck::index_t BLOCKSIZE   = 256;
+static constexpr ck::index_t NPerBlock   = 64;
+static constexpr ck::index_t MNPerXDL    = 16;
+static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
+static constexpr ck::index_t Nswizzle    = false;
+static constexpr ck::index_t AK1         = 16 / sizeof(A0DataType);
+static constexpr ck::index_t BK1         = 16 / sizeof(B0DataType);
+static constexpr ck::index_t EVec        = 16 / sizeof(EDataType);
+static constexpr ck::index_t D0Vec       = 1;
+static constexpr ck::index_t D1Vec       = 1;
+static constexpr ck::index_t ActOP       = 1; // 0: gelu_and_mul, 1: silu_and_mul
+static constexpr bool MulRoutedWeight    = false;
+using DeviceOpInstance                   = ck::tensor_operation::device::DeviceMoeGemm
+    // clang-format off
+        <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
+               //threadnum, mblock, nblock, kblock
+               BLOCKSIZE,   MPerBlock,   NPerBlock,    KPerBlock,
+               // ak1, bk1
+               AK1,   BK1,
+               // mn_perxdl
+               MNPerXDL,   MNPerXDL,
+               // mn_xdlperwave 
+               MXDLPerWave,    NXDLPerWave,
+               // a,b: loadtranfer cluster, cluster order, srcorder,VECDIM, srcpervec, dstpervec, lds_extra
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
+               //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+                2,    2,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, true, int32_t, A0DataType>;
+
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+
+    // GEMM shape
+    ck::index_t N               = 4096;
+    ck::index_t K               = 6144;
+    ck::index_t experts         = 8;
+    ck::index_t sorted_tile_num = 16;
+    ck::index_t valid_tile_num  = 13;
+    ck::index_t tokens          = 64;
+    ck::index_t topk            = 2;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+    }
+    else if(argc == 9)
+    {
+
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+        sorted_tile_num = std::stoi(argv[7]);
+        valid_tile_num  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 5: N, K, tokens\n");
+        exit(0);
+    }
+
+    ck::index_t sorted_size = sorted_tile_num * MPerBlock;
+    ck::index_t valid_size  = valid_tile_num * MPerBlock;
+    if(tokens * topk > valid_size)
+    {
+        printf("err config, tokens * topk > valid_size\n");
+        exit(-1);
+    }
+    ck::index_t StrideA              = K;
+    ck::index_t StrideB              = K;
+    ck::index_t StrideE              = N;
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{1, 1, 1};
+
+    ck::index_t KBatch = 1;
+
+    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
+    Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1 + sorted_tile_num}));
+    max_token_id.mData = {valid_size};
+    int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
+    for(int i = 0; i < sorted_tile_num; i++)
+    {
+        expert_ids.mData[i] = eids[i];
+    }
+    int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
+    int tokenid        = 0;
+
+    for(int i = 0; i < sorted_size; i++)
+    {
+        int tile_off = i % MPerBlock;
+        if(tile_off < token_per_tile && tokenid < tokens * topk)
+        {
+            sorted_token_ids.mData[i] = (tokenid % tokens) | ((tokenid / tokens) << 24);
+            tokenid++;
+        }
+        else
+        {
+            sorted_token_ids.mData[i] = tokens;
+        }
+    }
+    Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
+    Tensor<D1DataType> d1_e_n(
+        HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    Tensor<EDataType> e_t_n_device_result(
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 2:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0, 1});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{});
+        break;
+    case 3:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    default:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+    }
+    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) *
+                                   sorted_token_ids.mDesc.GetElementSpaceSize());
+    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
+    DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
+
+    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
+    expert_ids_dev.ToDevice(expert_ids.mData.data());
+    max_token_id_dev.ToDevice(max_token_id.mData.data());
+    a0_device_buf.ToDevice(a0_t_k.mData.data());
+    d0_device_buf.ToDevice(d0_t_n.mData.data());
+    d1_device_buf.ToDevice(d1_e_n.mData.data());
+    d2_device_buf.ToDevice(d2_e_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+
+    int NPerXdl = device_op.GetPreShuffleParameters();
+
+    preShuffleBuffer(
+        b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * 2 * experts, K, NPerXdl);
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                               expert_ids_dev.GetDeviceBuffer(),
+                               max_token_id_dev.GetDeviceBuffer(),
+                               a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer(),
+                                                                   d2_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               tokens,
+                               topk,
+                               sorted_size,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               StrideDs,
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    if(time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = std::size_t(2) * tokens * topk * N * 2 * K;
+        std::size_t num_btype = sizeof(A0DataType) * valid_tile_num * K +
+                                sizeof(B0DataType) * K * N * 2 * experts +
+                                sizeof(EDataType) * valid_tile_num * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << std::endl;
+    }
+
+    if(do_verification)
+    {
+        invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
+                                                                                   B0DataType,
+                                                                                   CShuffleDataType,
+                                                                                   D2DataType,
+                                                                                   AccDataType,
+                                                                                   PassThrough,
+                                                                                   PassThrough,
+                                                                                   PassThrough,
+                                                                                   ActOP,
+                                                                                   MulRoutedWeight>;
+        auto ref_moe_gemm           = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_moe_gemm.MakeInvoker();
+
+        auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids,
+                                                      expert_ids,
+                                                      max_token_id,
+                                                      MPerBlock,
+                                                      a0_t_k,
+                                                      d0_t_n,
+                                                      b0_e_n_k,
+                                                      d1_e_n,
+                                                      c_t_k_n,
+                                                      d2_e_n,
+                                                      PassThrough{},
+                                                      PassThrough{},
+                                                      PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < valid_size; ++m)
+        {
+
+            const int fuse_t  = sorted_token_ids.mData[m];
+            const int t       = fuse_t & 0xffffff;
+            const int topk_id = (fuse_t & 0xff000000) >> 24;
+
+            if(t >= tokens)
+            {
+                continue;
+            }
+            const int e = expert_ids(m / MPerBlock);
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_t_n_host_result(t, topk_id, n),
+                               c_t_k_n(t, topk_id, n),
+                               d0_t_n(t, n),
+                               d1_e_n(e, n),
+                               d2_e_n(e, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        return ck::utils::check_err(
+                   e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-1)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
new file mode 100644
index 0000000000..9e80a2ca35
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I4  = ck::pk_i4_t;
+using F16 = ck::half_t;
+using F8  = ck::f8_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F8;
+using B0DataType       = I4;
+using EDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using D2DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType, D2DataType>;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using ELayout  = Row;
+using D0Layout = Row;
+using D1Layout = Col;
+using D2Layout = ELayout;
+using DsLayout = ck::Tuple<D0Layout, D1Layout, D2Layout>;
+
+// for gate, a_scale, b_scale
+struct MulABScale
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, float, float>(
+        EDataType& e, const EDataType& c, const float& d0, const float& d1) const
+    {
+        (void)d0;
+        (void)d1;
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        e = ck::type_convert<EDataType>(c);
+#else
+        e = ck::type_convert<EDataType>(c);
+#endif
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float>(
+        EDataType& e, const float& c, const float& d0, const float& d1) const
+    {
+        (void)d0;
+        (void)d1;
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        e = ck::type_convert<EDataType>(c);
+#else
+        e = ck::type_convert<EDataType>(c);
+#endif
+    }
+};
+
+struct MulABScaleExpertWeight
+{
+    template <typename E, typename C, typename D0, typename D1, typename D2>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const;
+    // for real kernel use
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float, float>(
+        EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, float, float, float>(
+        EDataType& e, const EDataType& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    // for reference cpu
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float, float>(
+        float& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        // for reference cpu
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+};
+
+static constexpr bool MulRoutedWeight = true;
+
+using CDEElementOp = MulABScaleExpertWeight; // combine MulRoutedWeight = true
+
+// using CDEElementOp = MulABScale; // combine MulRoutedWeight = true
+
+#if 1
+void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
+{
+    int KPack = 32;
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            dst[outputIndex / 2] = src[(n * K + k) / 2];
+        }
+    }
+}
+#endif
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t Nswizzle  = false;
+static constexpr ck::index_t Act_OP    = 1; // 0: gelu_and_mul, 1: silu_and_mul
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm<
+            Row, Col, DsLayout, ELayout, 
+            A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+            AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
+            256,   MPerBlock,   64,    128,
+            16,   32,
+            16,   16,
+            8,    1,
+            S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0,
+            2,    1,   S<1, 32, 1, 8>, S<8, 1, 1>,
+            ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Act_OP, Nswizzle, true, MulRoutedWeight, true, ck::index_t, A0DataType>;
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+
+    // per expert:
+    // GEMM shape
+    ck::index_t N               = 14336;
+    ck::index_t K               = 4096;
+    ck::index_t experts         = 8;
+    ck::index_t sorted_tile_num = 16;
+    ck::index_t valid_tile_num  = 13;
+    ck::index_t sorted_size     = sorted_tile_num * MPerBlock;
+    ck::index_t valid_size      = valid_tile_num * MPerBlock;
+    ck::index_t tokens          = 644;
+    ck::index_t topk            = 2;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 5: N, K, tokens\n");
+        exit(0);
+    }
+
+    if(tokens * topk > valid_size)
+    {
+        printf("err config, tokens * topk > valid_size\n");
+        exit(-1);
+    }
+    ck::index_t StrideA              = K;
+    ck::index_t StrideB              = K;
+    ck::index_t StrideE              = N;
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{1, 1, 1};
+
+    ck::index_t KBatch = 1;
+
+    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
+    Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1 + sorted_tile_num}));
+    max_token_id.mData = {valid_size};
+    int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3};
+    for(int i = 0; i < sorted_tile_num; i++)
+    {
+        expert_ids.mData[i] = eids[i];
+    }
+    int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
+    int tokenid        = 0;
+    for(int i = 0; i < sorted_size; i++)
+    {
+        int tile_off = i % MPerBlock;
+        if(tile_off < token_per_tile)
+        {
+            sorted_token_ids.mData[i] = (tokenid % tokens) | ((tokenid / tokens) << 24);
+            tokenid++;
+        }
+        else
+        {
+            sorted_token_ids.mData[i] = tokens;
+        }
+    }
+
+    Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
+    Tensor<D1DataType> d1_e_n(
+        HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    Tensor<EDataType> e_t_n_device_result(
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+
+    std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
+    std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl;
+    std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 2:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{});
+        break;
+    default:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+    }
+    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) *
+                                   sorted_token_ids.mDesc.GetElementSpaceSize());
+    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
+    DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize() / 2);
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
+
+    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
+    expert_ids_dev.ToDevice(expert_ids.mData.data());
+    max_token_id_dev.ToDevice(max_token_id.mData.data());
+    a0_device_buf.ToDevice(a0_t_k.mData.data());
+    d0_device_buf.ToDevice(d0_t_n.mData.data());
+    d1_device_buf.ToDevice(d1_e_n.mData.data());
+    d2_device_buf.ToDevice(d2_e_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+
+#if 1
+    preShuffleBuffer(b0_e_n_k.mData.data(),
+                     b0_preshuffled.mData.data(),
+                     N * experts,
+                     K,
+                     device_op.GetPreShuffleParameters());
+#else
+    // weight pre-shuffle
+    int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
+    int NLane = device_op.GetPreShuffleParameters();
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int e = 0; e < experts; ++e)
+    {
+        for(int n = 0; n < N; ++n)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                int n0 = n / NLane;
+                int n1 = n % NLane;
+
+                int k0 = k / (KLane * KPack);
+                tempk = k % (KLane * KPack);
+                int k1 = tempk / KPack;
+                int k2 = tempk % KPack;
+
+                int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                                  k1 * KPack * NLane + n1 * KPack + k2;
+
+                b0_preshuffled(e, outputIndex % K, outputIndex / K) = b0_e_n_k(e, k, n);
+            }
+        }
+    }
+#endif
+
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+    // vector pk_i4x4 permute
+    for(int e = 0; e < experts; e++)
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j += 8)
+            {
+                int input[8];
+
+                for(int k = 0; k < 4; k++)
+                {
+                    int i4x2         = b0_preshuffled(e, j + k * 2, i).data;
+                    input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                    input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                }
+
+                // permute 01234567->20643175
+                {
+                    int hi   = input[2];
+                    int lo   = input[0];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 0, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[6];
+                    int lo   = input[4];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 2, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[3];
+                    int lo   = input[1];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 4, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[7];
+                    int lo   = input[5];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 6, i) = i4x2;
+                }
+            }
+        }
+    }
+#endif
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                               expert_ids_dev.GetDeviceBuffer(),
+                               max_token_id_dev.GetDeviceBuffer(),
+                               a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer(),
+                                                                   d2_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               tokens,
+                               topk,
+                               sorted_size,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               StrideDs,
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+    }
+
+    if(time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = std::size_t(2) * tokens * topk * N * 2 * K;
+        std::size_t num_btype = sizeof(A0DataType) * valid_tile_num * K +
+                                sizeof(B0DataType) / 2 * K * N * 2 * experts +
+                                sizeof(EDataType) * valid_tile_num * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << device_op.GetTypeString() << std::endl;
+    }
+
+    if(do_verification)
+    {
+        invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
+                                                                                   B0DataType,
+                                                                                   CShuffleDataType,
+                                                                                   D2DataType,
+                                                                                   AccDataType,
+                                                                                   PassThrough,
+                                                                                   PassThrough,
+                                                                                   PassThrough,
+                                                                                   Act_OP,
+                                                                                   MulRoutedWeight>;
+        auto ref_moe_gemm           = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_moe_gemm.MakeInvoker();
+
+        auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids,
+                                                      expert_ids,
+                                                      max_token_id,
+                                                      MPerBlock,
+                                                      a0_t_k,
+                                                      d0_t_n,
+                                                      b0_e_n_k,
+                                                      d1_e_n,
+                                                      c_t_k_n,
+                                                      d2_e_n,
+                                                      PassThrough{},
+                                                      PassThrough{},
+                                                      PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < valid_size; ++m)
+        {
+
+            const int fuse_t  = sorted_token_ids.mData[m];
+            const int t       = fuse_t & 0xffffff;
+            const int topk_id = (fuse_t & 0xff000000) >> 24;
+
+            if(t >= tokens)
+            {
+                continue;
+            }
+            const int e = expert_ids(m / MPerBlock);
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_t_n_host_result(t, topk_id, n),
+                               c_t_k_n(t, topk_id, n),
+                               d0_t_n(t, n),
+                               d1_e_n(e, n),
+                               d2_e_n(e, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+        return ck::utils::check_err(
+                   e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-1)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
new file mode 100644
index 0000000000..42d892fe26
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F8  = ck::f8_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F8;
+using B0DataType       = F8;
+using EDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using D2DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType, D2DataType>;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using ELayout  = Row;
+using D0Layout = Row;
+using D1Layout = Col;
+using D2Layout = ELayout;
+using DsLayout = ck::Tuple<D0Layout, D1Layout, D2Layout>;
+
+// d0: ascale, d1: bscale, d2:expert weight
+struct MulABScaleExpertWeight
+{
+    template <typename E, typename C, typename D0, typename D1, typename D2>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const;
+    // for real kernel use
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float, float>(
+        EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, float, float, float>(
+        EDataType& e, const EDataType& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    // for reference cpu
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float, float>(
+        float& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        // for reference cpu
+        e = ck::type_convert<EDataType>(c * d0 * d1 * d2);
+    }
+};
+
+using CDEElementOp = MulABScaleExpertWeight;
+
+void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int NXdl)
+{
+    int KPack = 16 / sizeof(B0DataType);
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            dst[outputIndex] = src[n * K + k];
+        }
+    }
+}
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MulABScaleExpertWeight;
+
+static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t BLOCKSIZE = 256;
+static constexpr ck::index_t MXDLPerWave = 4;
+static constexpr ck::index_t NXDLPerWave = 4;
+static constexpr ck::index_t NPerBlock   = 128;
+static constexpr ck::index_t MNPerXDL    = 16;
+static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
+
+static constexpr ck::index_t CShuffleNLane = 32;
+static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
+static constexpr ck::index_t AK1           = 16 / sizeof(A0DataType);
+static constexpr ck::index_t BK1           = 16 / sizeof(B0DataType);
+static constexpr ck::index_t EVec          = 2;
+static constexpr ck::index_t D0Vec         = 1;
+static constexpr ck::index_t D1Vec         = 1;
+static constexpr ck::index_t D2Vec         = 1;
+static constexpr bool MulRoutedWeight      = true;
+using DeviceOpInstance                     = ck::tensor_operation::device::DeviceMoeGemm
+    // clang-format off
+///######|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|         CShuffle|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+///######|         |         |         |        |       Type|       Type|       Type|      Type|        Type|         DataType| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+///######|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+///######|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
+///###### RCR
+        // kernel 1: 256->32x128x128 
+        // <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   32,   128,    128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 32, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>;
+        // <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   32,   128,    256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<16, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, EDataType>;
+        <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
+               //threadnum, mblock, nblock, kblock
+               BLOCKSIZE,   MPerBlock,   NPerBlock,    KPerBlock,
+               // ak1, bk1
+               AK1,   BK1,
+               // mn_perxdl
+               MNPerXDL,   MNPerXDL,
+               // mn_xdlperwave 
+               MXDLPerWave,  NXDLPerWave,
+               // a,b: loadtranfer cluster, cluster order, srcorder,VECDIM, srcpervec, dstpervec, lds_extra
+            //    S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
+            //    S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+               4,        2,         S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, MulRoutedWeight, false, int32_t, A0DataType>;
+        // kernel 2: 128->32x128x128
+        //  <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,   32,   128,    128,  16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 16, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>;
+
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+
+    // per expert:
+    // GEMM shape
+    ck::index_t N               = 4096;
+    ck::index_t K               = 4096;
+    ck::index_t experts         = 8;
+    ck::index_t sorted_tile_num = 16;
+    ck::index_t valid_tile_num  = 13;
+    ck::index_t sorted_size     = sorted_tile_num * MPerBlock;
+    ck::index_t valid_size      = valid_tile_num * MPerBlock;
+    ck::index_t tokens          = 128;
+    ck::index_t topk            = 2;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 3)
+    {
+        // use default case
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 7)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+    }
+    else if(argc == 9)
+    {
+
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+        sorted_tile_num = std::stoi(argv[7]);
+        valid_tile_num  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 6: N, K, tokens\n");
+        exit(0);
+    }
+
+    ck::index_t StrideA              = K;
+    ck::index_t StrideB              = K;
+    ck::index_t StrideE              = N;
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{0, 0, 0};
+
+    ck::index_t KBatch = 1;
+
+    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
+    Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
+
+    max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
+    int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3};
+
+    for(int i = 0; i < sorted_tile_num; i++)
+    {
+        expert_ids.mData[i] = eids[i];
+    }
+    if(tokens * topk > valid_size)
+    {
+        printf("err config, tokens * topk > valid_size\n");
+        exit(-1);
+    }
+    int token_per_tile = tokens * topk / valid_tile_num;
+    int tokenid        = 0;
+
+    for(int i = 0; i < sorted_size; i++)
+    {
+        int tile_off = i % MPerBlock;
+        if(tile_off < token_per_tile && tokenid < tokens * topk)
+        {
+            sorted_token_ids.mData[i] = (tokenid % tokens) | ((tokenid / tokens) << 24);
+            tokenid++;
+        }
+        else
+        {
+            sorted_token_ids.mData[i] = tokens;
+        }
+    }
+
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
+    Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
+    Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
+    e_t_n_device_result.SetZero();
+    std::cout << "a0_t_k_k: " << a0_t_k_k.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
+    std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl;
+    std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 2});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_2<D2DataType>{-2, 2});
+        break;
+    case 2:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_1<D2DataType>{});
+        break;
+    default:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+    }
+    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) *
+                                   sorted_token_ids.mDesc.GetElementSpaceSize());
+    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
+    DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
+
+    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
+    expert_ids_dev.ToDevice(expert_ids.mData.data());
+    max_token_id_dev.ToDevice(max_token_id.mData.data());
+    a0_device_buf.ToDevice(a0_t_k_k.mData.data());
+    d0_device_buf.ToDevice(d0_t_n.mData.data());
+    d1_device_buf.ToDevice(d1_e_n.mData.data());
+    d2_device_buf.ToDevice(d2_e_n.mData.data());
+    e_device_buf.ToDevice(e_t_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+
+    int NPerXdl = device_op.GetPreShuffleParameters();
+
+    preShuffleBuffer(b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * experts, K, NPerXdl);
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                               expert_ids_dev.GetDeviceBuffer(),
+                               max_token_id_dev.GetDeviceBuffer(),
+                               a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer(),
+                                                                   d2_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               tokens,
+                               topk,
+                               sorted_size,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               StrideDs,
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    if(time_kernel)
+    {
+        // not result correct here because output buf not setzero
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = std::size_t(2) * tokens * topk * N * K;
+        std::size_t num_btype = sizeof(A0DataType) * tokens * K * topk +
+                                sizeof(B0DataType) * K * N * experts +
+                                sizeof(EDataType) * tokens * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << std::endl;
+    }
+
+    if(do_verification)
+    {
+        // gemm2 use atomic, so need to reinit outputs
+        e_device_buf.ToDevice(e_t_n_device_result.mData.data());
+        invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
+
+        Tensor<float> c_t_n({tokens, N});
+
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceMoeGemm2<A0DataType,
+                                                          B0DataType,
+                                                          D0DataType,
+                                                          D1DataType,
+                                                          D2DataType,
+                                                          float,
+                                                          AccDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          CDEElementOp,
+                                                          MulRoutedWeight>;
+        auto ref_moe_gemm = ReferenceGemmInstance{};
+        auto ref_invoker  = ref_moe_gemm.MakeInvoker();
+        auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids,
+                                                      expert_ids,
+                                                      max_token_id,
+                                                      MPerBlock,
+                                                      a0_t_k_k,
+                                                      b0_e_n_k,
+                                                      d0_t_n,
+                                                      d1_e_n,
+                                                      d2_e_n,
+                                                      c_t_n,
+                                                      PassThrough{},
+                                                      PassThrough{},
+                                                      cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+        for(int t = 0; t < tokens; ++t)
+        {
+
+            for(int n = 0; n < N; ++n)
+            {
+                e_t_n_host_result(t, n) = ck::type_convert<EDataType>(c_t_n(t, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        return ck::utils::check_err(
+                   e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
new file mode 100644
index 0000000000..b9621cc9b3
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I4  = ck::pk_i4_t;
+using F16 = ck::half_t;
+using F8  = ck::f8_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F8;
+using B0DataType       = I4;
+using EDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using D2DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType, D2DataType>;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using ELayout  = Row;
+using D0Layout = Row;
+using D1Layout = Col;
+using D2Layout = ELayout;
+using DsLayout = ck::Tuple<D0Layout, D1Layout, D2Layout>;
+
+// d0: ascale, d1: bscale, d2:expert weight
+struct MulABScaleExpertWeight
+{
+    template <typename E, typename C, typename D0, typename D1, typename D2>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const;
+    // for real kernel use
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float, float>(
+        EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        (void)d0;
+        (void)d1;
+        (void)d2;
+
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        e = ck::type_convert<EDataType>(c * 16);
+#else
+        e = ck::type_convert<EDataType>(c);
+#endif
+    }
+    // for reference cpu
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float, float>(
+        float& e, const float& c, const float& d0, const float& d1, const float& d2) const
+    {
+        // for reference cpu
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        e = ck::type_convert<EDataType>(c * d0 * d1 * d2 * 16);
+#else
+        e = ck::type_convert<EDataType>(c * d0 * d1 * d2);
+#endif
+    }
+};
+
+using CDEElementOp = MulABScaleExpertWeight;
+
+void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
+{
+    int KPack = 32;
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            dst[outputIndex / 2] = src[(n * K + k) / 2];
+        }
+    }
+}
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MulABScaleExpertWeight;
+
+static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t BLOCKSIZE = 256;
+static constexpr ck::index_t MXDLPerWave   = 8;
+static constexpr ck::index_t NXDLPerWave   = 2;
+static constexpr ck::index_t NPerBlock     = 128;
+static constexpr ck::index_t MNPerXDL      = 16;
+static constexpr ck::index_t KPerBlock     = 128 / sizeof(A0DataType);
+static constexpr ck::index_t CShuffleNLane = 32;
+static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
+static constexpr ck::index_t AK1           = 16 / sizeof(A0DataType);
+static constexpr ck::index_t BK1           = 32 / sizeof(B0DataType);
+static constexpr ck::index_t EVec          = 2;
+static constexpr ck::index_t D0Vec         = 1;
+static constexpr ck::index_t D1Vec         = 1;
+static constexpr ck::index_t D2Vec         = 1;
+static constexpr bool MulRoutedWeight      = true;
+using DeviceOpInstance                     = ck::tensor_operation::device::DeviceMoeGemm
+    // clang-format off
+        <      Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
+               BLOCKSIZE,   MPerBlock,   NPerBlock,    KPerBlock,
+               AK1,   BK1,
+               MNPerXDL,   MNPerXDL,
+               MXDLPerWave,    NXDLPerWave,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
+               2,    2,   S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, MulRoutedWeight, false, ck::index_t, A0DataType>;
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+
+    // per expert:
+    // GEMM shape
+    ck::index_t N               = 4096;
+    ck::index_t K               = 14336;
+    ck::index_t experts         = 8;
+    ck::index_t sorted_tile_num = 19;
+    ck::index_t valid_tile_num  = 16;
+    ck::index_t sorted_size     = sorted_tile_num * MPerBlock;
+    ck::index_t valid_size      = valid_tile_num * MPerBlock;
+    ck::index_t tokens          = 512;
+    ck::index_t topk            = 2;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 3)
+    {
+        // use default case
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 7)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 6: N, K, tokens\n");
+        exit(0);
+    }
+
+    ck::index_t StrideA              = K;
+    ck::index_t StrideB              = K;
+    ck::index_t StrideE              = N;
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{0, 0, 0};
+
+    ck::index_t KBatch = 1;
+
+    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
+    Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
+    max_token_id.mData[0] = valid_size;
+    int eids[]            = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3};
+    for(int i = 0; i < sorted_tile_num; i++)
+    {
+        expert_ids.mData[i] = eids[i];
+    }
+    if(tokens * topk > valid_size)
+    {
+        printf("err config, tokens * topk > valid_size\n");
+        exit(-1);
+    }
+    int token_per_tile = tokens * topk / valid_tile_num;
+    int tokenid        = 0;
+    for(int i = 0; i < sorted_size; i++)
+    {
+        int tile_off = i % MPerBlock;
+        if(tile_off < token_per_tile)
+        {
+            sorted_token_ids.mData[i] = (tokenid % tokens) | ((tokenid / tokens) << 24);
+            tokenid++;
+        }
+        else
+        {
+            sorted_token_ids.mData[i] = tokens;
+        }
+    }
+    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
+    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
+    Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
+    Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
+    e_t_n_device_result.SetZero();
+    std::cout << "a0_t_k_k: " << a0_t_k_k.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
+    std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl;
+    std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 2});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_2<D2DataType>{-2, 2});
+        break;
+    case 2:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_1<D2DataType>{});
+        break;
+    case 3:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 4:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_1<D2DataType>{});
+        break;
+    default:
+        a0_t_k_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+    }
+    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) *
+                                   sorted_token_ids.mDesc.GetElementSpaceSize());
+    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
+    DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize() / 2);
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
+
+    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
+    expert_ids_dev.ToDevice(expert_ids.mData.data());
+    max_token_id_dev.ToDevice(max_token_id.mData.data());
+    a0_device_buf.ToDevice(a0_t_k_k.mData.data());
+    d0_device_buf.ToDevice(d0_t_n.mData.data());
+    d1_device_buf.ToDevice(d1_e_n.mData.data());
+    d2_device_buf.ToDevice(d2_e_n.mData.data());
+    e_device_buf.ToDevice(e_t_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+
+    preShuffleBuffer(b0_e_n_k.mData.data(),
+                     b0_preshuffled.mData.data(),
+                     N * experts,
+                     K,
+                     device_op.GetPreShuffleParameters());
+
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+    // vector pk_i4x4 permute
+    for(int e = 0; e < experts; e++)
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j += 8)
+            {
+                int input[8];
+
+                for(int k = 0; k < 4; k++)
+                {
+                    int i4x2         = b0_preshuffled(e, j + k * 2, i).data;
+                    input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                    input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                }
+
+                // permute 01234567->20643175
+                {
+                    int hi   = input[2];
+                    int lo   = input[0];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 0, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[6];
+                    int lo   = input[4];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 2, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[3];
+                    int lo   = input[1];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 4, i) = i4x2;
+                }
+
+                {
+                    int hi   = input[7];
+                    int lo   = input[5];
+                    int i4x2 = (hi << 4) | lo;
+
+                    b0_preshuffled(e, j + 6, i) = i4x2;
+                }
+            }
+        }
+    }
+#endif
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                               expert_ids_dev.GetDeviceBuffer(),
+                               max_token_id_dev.GetDeviceBuffer(),
+                               a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer(),
+                                                                   d2_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               tokens,
+                               topk,
+                               sorted_size,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               StrideDs,
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    {
+        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+    }
+
+    if(time_kernel)
+    {
+        // not result correct here because output buf not setzero
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = std::size_t(2) * tokens * topk * N * K;
+        std::size_t num_btype = sizeof(A0DataType) * tokens * K * topk +
+                                sizeof(B0DataType) / 2 * K * N * experts +
+                                sizeof(EDataType) * tokens * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << device_op.GetTypeString() << std::endl;
+    }
+
+    if(do_verification)
+    {
+        // gemm2 use atomic, so need to reinit outputs
+        e_device_buf.ToDevice(e_t_n_device_result.mData.data());
+        invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
+
+        Tensor<CShuffleDataType> c_t_n({tokens, N});
+
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceMoeGemm2<A0DataType,
+                                                          B0DataType,
+                                                          D0DataType,
+                                                          D1DataType,
+                                                          D2DataType,
+                                                          CShuffleDataType,
+                                                          AccDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          CDEElementOp,
+                                                          MulRoutedWeight>;
+
+        auto ref_moe_gemm = ReferenceGemmInstance{};
+        auto ref_invoker  = ref_moe_gemm.MakeInvoker();
+        auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids,
+                                                      expert_ids,
+                                                      max_token_id,
+                                                      MPerBlock,
+                                                      a0_t_k_k,
+                                                      b0_e_n_k,
+                                                      d0_t_n,
+                                                      d1_e_n,
+                                                      d2_e_n,
+                                                      c_t_n,
+                                                      PassThrough{},
+                                                      PassThrough{},
+                                                      cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+        for(int t = 0; t < tokens; ++t)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                e_t_n_host_result(t, n) = ck::type_convert<EDataType>(c_t_n(t, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        return ck::utils::check_err(
+                   e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt
index 93770684df..34125465a9 100644
--- a/example/67_gemm_microscaling/CMakeLists.txt
+++ b/example/67_gemm_microscaling/CMakeLists.txt
@@ -3,3 +3,6 @@ add_custom_target(example_gemm_mx)
 add_example_executable(example_gemm_mx_fp8 gemm_mx_fp8.cpp)
 add_example_dependencies(example_gemm_mx example_gemm_mx_fp8)
 
+add_example_executable(example_gemm_mx_bf8 gemm_mx_bf8.cpp)
+add_example_dependencies(example_gemm_mx example_gemm_mx_bf8)
+
diff --git a/example/67_gemm_microscaling/README.md b/example/67_gemm_microscaling/README.md
index c0a0972db6..57b6490eda 100644
--- a/example/67_gemm_microscaling/README.md
+++ b/example/67_gemm_microscaling/README.md
@@ -2,15 +2,23 @@
 
 ## example_gemm_mx_fp8
 
+Custom verification parameters:
 ```bash
 # arg1: verification (0=no, 1=CPU)
-# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+# arg2: initialization (0=constant values, 1=integer values, 2=decimal values)
 # arg3: time kernel (0=no, 1=yes)
 # arg4: verbosity (0=no info, 1=verbose info)
-# arg5 to 10: M (16x), N(16x), K(16x), StrideA, StrideB, StrideC
+# arg5 to 10: M(128x), N(128x), K(64x), StrideA, StrideB, StrideC
+# arg11: KBatch
 ./bin/example_gemm_mx_fp8 1 1 0 1
 ```
 
+Custom tensor shapes:
+```bash
+./bin/example_gemm_mx_fp8 1 2 1 0 128  128  256 -1 -1 -1 1
+```
+
+Default invocation:
 ```bash
 # Implies: ./bin/example_gemm_mx_fp8 1 2 0 0
 ./bin/example_gemm_mx_fp8
diff --git a/example/67_gemm_microscaling/gemm_mx_bf8.cpp b/example/67_gemm_microscaling/gemm_mx_bf8.cpp
new file mode 100644
index 0000000000..8e341fb591
--- /dev/null
+++ b/example/67_gemm_microscaling/gemm_mx_bf8.cpp
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_mx_common.hpp"
+
+using ADataType = ck::bf8_t;
+using BDataType = ck::bf8_t;
+
+using XDataType = ck::e8m0_bexp_t;
+
+using CDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = CDataType;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough; // elementwise transformation for A matrix
+using BElementOp = PassThrough; // elementwise transformation for B matrix
+using CElementOp = PassThrough; // elementwise transformation for C matrix
+
+constexpr ck::index_t ScaleBlockSize = 32; // scaling block size
+constexpr ck::index_t KPerBlock      = 128;
+
+constexpr auto GemmSpec      = ck::tensor_operation::device::GemmSpecialization::Default;
+constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave;
+constexpr auto BlkGemmPVer   = ck::BlockGemmPipelineVersion::v1;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMX_Xdl_CShuffleV3<
+    ALayout,          // ALayout
+    BLayout,          // BLayout
+    CLayout,          // CLayout
+    ADataType,        // ADataType
+    XDataType,        // AScaleDataType
+    BDataType,        // BDataType
+    XDataType,        // BScaleDataType
+    CDataType,        // CDataType
+    AccDataType,      // GemmAccDataType
+    CShuffleDataType, // CShuffleDataType
+    AElementOp,       // AElementwiseOperation
+    BElementOp,       // BElementwiseOperation
+    CElementOp,       // CElementwiseOperation
+    GemmSpec,         // GemmSpec
+    ScaleBlockSize,   // ScaleBlockSize: Scaling block size
+    128,              // BlockSize: Thread block size
+    128,              // MPerBlock
+    16,               // NPerBlock
+    KPerBlock,        // KPerBlock
+    16,               // AK1
+    16,               // BK1
+    16,               // MPerXDL
+    16,               // NPerXDL
+    4,                // MXdlPerWave
+    1,                // NXdlPerWave
+    S<8, 16, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
+    2,                // ABlockTransferSrcVectorDim
+    16,               // ABlockTransferSrcScalarPerVector
+    16,               // ABlockTransferDstScalarPerVector_AK1
+    false,            // ABlockLdsExtraM
+    S<8, 16, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
+    2,                // BBlockTransferSrcVectorDim
+    16,               // BBlockTransferSrcScalarPerVector
+    16,               // BBlockTransferDstScalarPerVector_BK1
+    false,            // BBlockLdsExtraN
+    1,                // CShuffleMXdlPerWavePerShuffle
+    1,                // CShuffleNXdlPerWavePerShuffle
+    S<1, 16, 1, 8>,   // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    2,                // CShuffleBlockTransferScalarPerVector_NPerBlock
+    BlkGemmPSched,    // BlkGemmPipeSched
+    BlkGemmPVer,      // BlkGemmPipelineVer
+    ADataType,        // ComputeTypeA
+    BDataType         // ComputeTypeB
+    >;
+
+int main(int argc, char* argv[])
+{
+    return run_mx_gemm_example<DeviceOpInstance,
+                               ADataType,
+                               BDataType,
+                               XDataType,
+                               CDataType,
+                               ALayout,
+                               BLayout,
+                               CLayout,
+                               AElementOp,
+                               BElementOp,
+                               CElementOp,
+                               AccDataType,
+                               CShuffleDataType,
+                               ScaleBlockSize>(argc, argv)
+               ? 0
+               : -1;
+}
diff --git a/example/67_gemm_microscaling/gemm_mx_common.hpp b/example/67_gemm_microscaling/gemm_mx_common.hpp
index 30f03cb53b..32ef975192 100644
--- a/example/67_gemm_microscaling/gemm_mx_common.hpp
+++ b/example/67_gemm_microscaling/gemm_mx_common.hpp
@@ -9,20 +9,17 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/utility/blkgemmpipe_scheduler.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/sequence.hpp"
-
 #include "ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp"
-
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 
-using ScaleDataType = ck::e8m0_bexp_t;
-
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 
@@ -31,16 +28,19 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
+using ck::type_convert;
+
 struct ExecutionConfig final
 {
     int do_verification = 1;     // (0=no, 1=CPU)
-    int init_method     = 2;     // (0=no init, 1=integer value, 2=decimal value)
+    int init_method     = 2;     // (0=constant values, 1=integer values, 2=decimal values)
     bool time_kernel    = false; // (0=no, 1=yes)
     int verbosity       = 0;     // (0=no info, 1=verbose info)
 };
 
-struct ProblemSize final
+struct ProblemSizeSplitK final
 {
+
     ck::index_t M = 3840;
     ck::index_t N = 4096;
     ck::index_t K = 4096;
@@ -48,9 +48,14 @@ struct ProblemSize final
     ck::index_t StrideA = -1;
     ck::index_t StrideB = -1;
     ck::index_t StrideC = -1;
+
+    ck::index_t KBatch = 1;
 };
 
-bool parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
+bool parse_cmd_args(int argc,
+                    char* argv[],
+                    ProblemSizeSplitK& problem_size,
+                    ExecutionConfig& config)
 {
     if(argc == 1)
     {
@@ -63,7 +68,7 @@ bool parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, Execution
         config.time_kernel     = std::stoi(argv[3]);
         config.verbosity       = std::stoi(argv[4]);
     }
-    else if(argc == 11)
+    else if(argc >= 11)
     {
         config.do_verification = std::stoi(argv[1]);
         config.init_method     = std::stoi(argv[2]);
@@ -77,78 +82,43 @@ bool parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, Execution
         problem_size.StrideA = std::stoi(argv[8]);
         problem_size.StrideB = std::stoi(argv[9]);
         problem_size.StrideC = std::stoi(argv[10]);
+
+        if(argc >= 12)
+        {
+            problem_size.KBatch = std::stoi(argv[11]);
+        }
     }
     else
     {
         std::cerr << "arg1: verification (0=no, 1=CPU)" << std::endl
-                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << "arg2: initialization (0=constant values, 1=integer values, 2=decimal values)"
                   << std::endl
                   << "arg3: time kernel (0=no, 1=yes)" << std::endl
                   << "arg4: verbosity (0=no info, 1=verbose info)" << std::endl
-                  << "arg5 to 10: M (16x), N(16x), K(16x), StrideA, StrideB, StrideC" << std::endl;
+                  << "arg5 to 10: M(128x), N(128x), K(256x), StrideA, StrideB, StrideC" << std::endl
+                  << "arg11: KBatch" << std::endl;
         return false;
     }
 
     return true;
 }
 
-template <typename ADataType,
+template <typename DeviceOpInstance,
+          typename ADataType,
           typename BDataType,
           typename XDataType,
           typename CDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout,
-          typename CElementWiseOp,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
           typename AccDataType,
           typename CShuffleDataType,
-          ck::index_t MXVectorSize>
-bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
+          ck::index_t ScaleBlockSize>
+bool run_mx_gemm(const ProblemSizeSplitK& problem_size, const ExecutionConfig& config)
 {
-    using ELayout      = CLayout;
-    using DsLayout     = ck::Tuple<>;
-    using DsDataType   = ck::Tuple<>;
-    using AElementOp   = PassThrough;
-    using BElementOp   = PassThrough;
-    using CDEElementOp = CElementWiseOp;
-
-    static constexpr auto GemmSpec      = ck::tensor_operation::device::GemmSpecialization::Default;
-    static constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave;
-    static constexpr auto BlkGemmPVer   = ck::BlockGemmPipelineVersion::v3;
-
-#if 1
-    // XXX: These parameters should not exist in MX-native GEMM kernel
-    static constexpr ck::index_t Scale_Block_M = 128;
-    static constexpr ck::index_t Scale_Block_N = 128;
-#endif
-    static constexpr ck::index_t Scale_Block_K = MXVectorSize;
-
-    // XXX: DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 is not designed to utilize MX-specific MFMA
-    //      instructions.
-    //
-    // XXX: DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 is not designed to utilize device-optimized
-    //      scaled type convert functions.
-    //
-    // XXX: In DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3, KPerBlock is expected to be equal to
-    //      ScaleBlockK (aka MXVectorSize).
-    //      Additionally, the following is also expected:
-    //         static_assert(ScaleBlockM % MPerBlock == 0);
-    //         static_assert(ScaleBlockN % NPerBlock == 0);
-    //         In MX-native GEMM kernel these requirements should be relaxed.
-    //
-    // XXX: It appears, by default we are using mfma_f32_16x16x4xf32
-    //      MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk =
-    //          MfmaSelector<float, 16, 16, float>::selected_mfma.k_per_blk = mfma_f32_16x16x4xf32
-    // XXX: GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3 assumes scale type is float
-
-    // clang-format off
-    using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
-    // ######| ALayout| BLayout| DsLayout| CLayout| ADataType|    AScale| BDataType|    BScale| DsDataType| CDataType|     GemmAcc| CShuffleDataType|AElementwise|BElementwise| CElementwise| GemmSpec|Block|   ScaleBlockM|   ScaleBlockN|   ScaleBlockK|    M|    N|             K| AK1| BK1|   M|   N|MXdl|NXdl|ABlockTransfer|ABlockTransfer|ABlockTransfer|ABlockTransfer|ABlockTransfer|ABlockTransfer|   ABlock|BBlockTransfer|BBlockTransfer|BBlockTransfer|BBlockTransfer|BBlockTransfer|BBlockTransfer|   BBlock|  CShuffle|  CShuffle|CShuffleBlockTransfer|CDEShuffleBlockTransfer|       BlkGemm|     BlkGemm|ComputeTypeA|ComputeTypeB|LDSTypeA|LDSTypeB|
-    // ######|        |        |         |        |          |  DataType|          |  DataType|           |          |    DataType|                 |   Operation|   Operation|    Operation|         | Size|              |              |              |  Per|  Per|           Per|    |    | Per| Per| Per| Per| ThreadCluster| ThreadCluster|SrcAccessOrder|  SrcVectorDim|     SrcScalar|     DstScalar|LdsExtraM| ThreadCluster| ThreadCluster|SrcAccessOrder|     SrcVector|     SrcScalar|     DstScalar|LdsExtraN|      MXdl|      NXdl|       ClusterLengths|                 Scalar|     PipeSched| PipelineVer|            |            |        |        |
-    // ######|        |        |         |        |          |          |          |          |           |          |            |                 |            |            |             |         |     |              |              |              |Block|Block|         Block|    |    | XDL| XDL|Wave|Wave|       Lengths|  ArrangeOrder|              |              |     PerVector| PerVector_AK1|         |       Lengths|  ArrangeOrder|              |           Dim|     PerVector| PerVector_BK1|         |   PerWave|   PerWave|     MBlock_MPerBlock|             PerVectors|              |            |            |            |        |        |
-    // ######|        |        |         |        |          |          |          |          |           |          |            |                 |            |            |             |         |     |              |              |              |     |     |              |    |    |    |    |    |    |     AK0_M_AK1|              |              |              |              |              |         |     BK0_N_BK1|              |              |                             |              |         |PerShuffle|PerShuffle|     NBlock_NPerBlock|                       |              |            |            |            |        |        |
-             < ALayout, BLayout, DsLayout, ELayout, ADataType, XDataType, BDataType, XDataType, DsDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp, GemmSpec,  256, Scale_Block_M, Scale_Block_N, Scale_Block_K,  128,  128,           128,  16,  16,  16,  16,   4,   4,   S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,             2,            16,            16,        0,   S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,             2,            16,            16,        0,         1,         2,       S<1, 32, 1, 8>,             S<8, 8, 1>, BlkGemmPSched, BlkGemmPVer,       float,       float,    float,  float>;
-    // clang-format on
 
     auto M       = problem_size.M;
     auto N       = problem_size.N;
@@ -156,6 +126,7 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     auto StrideA = problem_size.StrideA;
     auto StrideB = problem_size.StrideB;
     auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
 
     auto f_host_tensor_descriptor =
         [](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) {
@@ -191,21 +162,26 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
     StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
 
-    if(K % Scale_Block_K != 0)
+    if(K % ScaleBlockSize != 0)
     {
-        throw std::runtime_error("wrong! K must be multiple of Scale_Block_K (16 or 32)");
+        throw std::runtime_error("wrong! K must be multiple of ScaleBlockSize.");
     };
 
-    auto Scale_Stride_AM = f_get_default_stride(M, K / Scale_Block_K, StrideA, ALayout{});
-    auto Scale_Stride_BN = f_get_default_stride(K / Scale_Block_K, N, StrideB, BLayout{});
+    // Hardcode scale layouts as per pipeline assumptions
+    // TODO: Allow user to specify scale layouts
+    using AScaleLayout = Row;
+    using BScaleLayout = Col;
+
+    auto Scale_Stride_AM = f_get_default_stride(M, K / ScaleBlockSize, -1, AScaleLayout{});
+    auto Scale_Stride_BN = f_get_default_stride(K / ScaleBlockSize, N, -1, BScaleLayout{});
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
 
-    Tensor<XDataType> a_m_k_scale(
-        f_host_tensor_descriptor(M, K / Scale_Block_K, Scale_Stride_AM, ALayout{})); // scales for A
-    Tensor<XDataType> b_k_n_scale(
-        f_host_tensor_descriptor(K / Scale_Block_K, N, Scale_Stride_BN, BLayout{})); // scales for B
+    Tensor<XDataType> a_m_k_scale(f_host_tensor_descriptor(
+        M, K / ScaleBlockSize, Scale_Stride_AM, AScaleLayout{})); // scales for A
+    Tensor<XDataType> b_k_n_scale(f_host_tensor_descriptor(
+        K / ScaleBlockSize, N, Scale_Stride_BN, BScaleLayout{})); // scales for B
 
     Tensor<CDataType> c_m_n_host_result(
         f_host_tensor_descriptor(M, N, StrideC, CLayout{})); // host verification
@@ -223,28 +199,49 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
 
     switch(config.init_method)
     {
-    case 0:
-        if(config.verbosity > 0)
-        {
-            std::cout << "NOTE: No input data initialization." << std::endl;
-        }
-        break;
-    case 1:
-    case 2:
+    case 0: // Initializations for development and debugging
         ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.0f)}(a_m_k);
-        ck::utils::FillConstant<XDataType>{ck::type_convert<XDataType>(0.5f)}(a_m_k_scale);
-        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.0f)}(b_k_n);
-        ck::utils::FillConstant<XDataType>{ck::type_convert<XDataType>(2.0f)}(b_k_n_scale);
+        ck::utils::FillConstant<XDataType>{ck::type_convert<XDataType>(2.0f)}(a_m_k_scale);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(0.5f)}(b_k_n);
+        ck::utils::FillConstant<XDataType>{ck::type_convert<XDataType>(1.0f)}(b_k_n_scale);
         if(config.verbosity > 0)
         {
             std::cout << "Init A = {1}" << std::endl;
-            std::cout << "Init A scale = {0.5}" << std::endl;
-            std::cout << "Init B = {1}" << std::endl;
-            std::cout << "Init B scale = {2.0}" << std::endl;
+            std::cout << "Init A scale = {2.0}" << std::endl;
+            std::cout << "Init B = {0.5}" << std::endl;
+            std::cout << "Init B scale = {1.0}" << std::endl;
             std::cout << "Expect C = {K}" << std::endl;
         }
         break;
 
+    case 1:
+
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 6}); // Z[-5,5]
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 6}); // Z[-5,5]
+
+        if constexpr(ck::is_same_v<XDataType, ck::e8m0_bexp_t>)
+        {
+            a_m_k_scale.GenerateTensorValue(
+                GeneratorTensor_2<XDataType>{125, 129}); // scales: {0.25, 0.5, 1, 2}
+            b_k_n_scale.GenerateTensorValue(
+                GeneratorTensor_2<XDataType>{125, 129}); // scales: {0.25, 0.5, 1, 2}
+        }
+        else
+        {
+            ck::utils::FillUniformDistributionIntegerValue<XDataType>{-1.0f, 1.0f}(a_m_k_scale);
+            ck::utils::FillUniformDistributionIntegerValue<XDataType>{-1.0f, 1.0f}(b_k_n_scale);
+        }
+
+        break;
+
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2.0, 2.0});
+        a_m_k_scale.GenerateTensorValue(GeneratorTensor_3<XDataType>{powf(2.0f, -125.0f), 1.0f});
+
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2.0, 2.0});
+        b_k_n_scale.GenerateTensorValue(GeneratorTensor_3<XDataType>{powf(2.0f, -125.0f), 1.0f});
+        break;
+
     default:
         if(config.verbosity > 0)
         {
@@ -269,31 +266,31 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     if(config.verbosity > 0)
         std::cout << "Done." << std::endl;
 
-    auto a_element_op   = AElementOp{};
-    auto b_element_op   = BElementOp{};
-    auto cde_element_op = CDEElementOp{};
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
 
-    constexpr ck::index_t NumDTensor = DsDataType::Size();
-
-    // do GEMM
+    // run GEMM
     auto device_op = DeviceOpInstance{};
     auto invoker   = device_op.MakeInvoker();
-    auto argument  = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
-                                           b_device_buf.GetDeviceBuffer(),
-                                           std::array<const void*, NumDTensor>{},
-                                           c_device_buf.GetDeviceBuffer(),
-                                           M,
-                                           N,
-                                           K,
-                                           StrideA,
-                                           StrideB,
-                                           std::array<ck::index_t, NumDTensor>{},
-                                           StrideC,
-                                           a_scale_device_buf.GetDeviceBuffer(),
-                                           b_scale_device_buf.GetDeviceBuffer(),
-                                           a_element_op,
-                                           b_element_op,
-                                           cde_element_op);
+    auto argument =
+        device_op.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                               static_cast<XDataType*>(a_scale_device_buf.GetDeviceBuffer()),
+                               static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                               static_cast<XDataType*>(b_scale_device_buf.GetDeviceBuffer()),
+                               static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               Scale_Stride_AM,
+                               StrideB,
+                               Scale_Stride_BN,
+                               StrideC,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               c_element_op);
 
     if(!device_op.IsSupportedArgument(argument))
     {
@@ -303,7 +300,10 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     }
 
     if(config.verbosity > 0)
-        std::cout << "Computing GEMM on device..." << std::endl;
+    {
+        std::cout << "Computing GEMM on device..." << std::endl << std::endl;
+    }
+
     float ave_time =
         invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, config.verbosity, 20, 50});
 
@@ -321,7 +321,7 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
                                                                                   BDataType,
                                                                                   CDataType,
                                                                                   AccDataType,
-                                                                                  float,
+                                                                                  XDataType,
                                                                                   PassThrough,
                                                                                   PassThrough,
                                                                                   PassThrough,
@@ -347,12 +347,15 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
             std::cout << "Comparing results..." << std::endl;
         }
 
-        if(config.init_method == 1)
+        if(config.init_method == 0)
         {
-            res_verified =
-                res_verified && std::abs(static_cast<float>(K) - c_m_n_device_result(0, 0)) <= 0.0f;
-            std::cout << "Expected vs Computed: " << 1.0f * K << " vs " << c_m_n_device_result(0, 0)
-                      << ((res_verified) ? " (PASSED!)" : " (FAILED!)") << std::endl;
+            auto expected = static_cast<float>(K);
+            auto computed = type_convert<float>(c_m_n_device_result(1, 12));
+
+            res_verified = res_verified && std::abs(expected - computed) <= 0.0f;
+            std::cout << "\nExpected vs Computed: " << expected << " vs " << computed
+                      << ((res_verified) ? " (PASSED!)" : " (FAILED!)") << std::endl
+                      << std::endl;
         }
 
         res_verified = res_verified && ck::utils::check_err(c_m_n_device_result,
@@ -360,7 +363,7 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
                                                             "Error: Incorrect results!");
 
         if(config.verbosity > 0 && res_verified)
-            std::cout << "Done." << std::endl;
+            std::cout << "Verification Successful!" << std::endl;
     }
     else
     {
@@ -370,47 +373,56 @@ bool run_mx_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
 
     if(config.time_kernel)
     {
-        std::size_t flop = std::size_t(2) * M * N * K + M * K + K * N; // GEMM + A scale + B scale
+        // Output size(M*N) * [dot product(2K) + product of scales(K/ScaleBlockSize) + scaling of
+        // partial sums(K/ScaleBlockSize)]
+        // FLOPS = 2 * M * N * K + 2 * M * N * K / ScaleBlockSize
+        std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / ScaleBlockSize;
         std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                 sizeof(CDataType) * M * N +
-                                sizeof(XDataType) * (M * K + K * N) / Scale_Block_K;
+                                sizeof(XDataType) * (M * K + K * N) / ScaleBlockSize;
 
         float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
         float gb_per_sec = num_btype / 1.E6 / ave_time;
 
         std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                  << " GB/s" << std::endl;
+                  << " GB/s, " << device_op.GetTypeString() << std::endl;
     }
 
     return res_verified;
 }
 
-template <typename ADataType,
+template <typename DeviceOpInstance,
+          typename ADataType,
           typename BDataType,
           typename XDataType,
           typename CDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout,
-          typename CElementWiseOp,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
           typename AccDataType,
           typename CShuffleDataType,
           ck::index_t MXVectorSize>
 bool run_mx_gemm_example(int argc, char* argv[])
 {
-    ProblemSize problem_size;
+    ProblemSizeSplitK problem_size;
     ExecutionConfig config;
 
     return parse_cmd_args(argc, argv, problem_size, config) &&
-           run_mx_gemm<ADataType,
+           run_mx_gemm<DeviceOpInstance,
+                       ADataType,
                        BDataType,
                        XDataType,
                        CDataType,
                        ALayout,
                        BLayout,
                        CLayout,
-                       CElementWiseOp,
+                       AElementOp,
+                       BElementOp,
+                       CElementOp,
                        AccDataType,
                        CShuffleDataType,
                        MXVectorSize>(problem_size, config);
diff --git a/example/67_gemm_microscaling/gemm_mx_fp8.cpp b/example/67_gemm_microscaling/gemm_mx_fp8.cpp
index d2e21698ec..9fc5666197 100644
--- a/example/67_gemm_microscaling/gemm_mx_fp8.cpp
+++ b/example/67_gemm_microscaling/gemm_mx_fp8.cpp
@@ -5,37 +5,94 @@
 
 using ADataType = ck::f8_t;
 using BDataType = ck::f8_t;
-#if 1
-// XXX: MX-native GEMM kernel will work with e8m0_bexp_t scale type
-using XDataType = float;
-#else
+
 using XDataType = ck::e8m0_bexp_t;
-#endif
+
+using CDataType        = ck::half_t;
 using AccDataType      = float;
-using CShuffleDataType = float;
-using CDataType        = float;
+using CShuffleDataType = CDataType;
 
 using ALayout = Row;
 using BLayout = Col;
 using CLayout = Row;
 
+using AElementOp = PassThrough; // elementwise transformation for A matrix
+using BElementOp = PassThrough; // elementwise transformation for B matrix
 using CElementOp = PassThrough; // elementwise transformation for C matrix
 
-constexpr ck::index_t mx_vector_size = 128; // scaling block size
+constexpr ck::index_t ScaleBlockSize = 32; // scaling block size
+constexpr ck::index_t KPerBlock      = 256;
+
+constexpr auto GemmSpec      = ck::tensor_operation::device::GemmSpecialization::Default;
+constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave;
+constexpr auto BlkGemmPVer   = ck::BlockGemmPipelineVersion::v1;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMX_Xdl_CShuffleV3<
+    ALayout,          // ALayout
+    BLayout,          // BLayout
+    CLayout,          // CLayout
+    ADataType,        // ADataType
+    XDataType,        // AScaleDataType
+    BDataType,        // BDataType
+    XDataType,        // BScaleDataType
+    CDataType,        // CDataType
+    AccDataType,      // GemmAccDataType
+    CShuffleDataType, // CShuffleDataType
+    AElementOp,       // AElementwiseOperation
+    BElementOp,       // BElementwiseOperation
+    CElementOp,       // CElementwiseOperation
+    GemmSpec,         // GemmSpec
+    ScaleBlockSize,   // ScaleBlockSize: Scaling block size
+    256,              // BlockSize: Thread block size
+    128,              // MPerBlock
+    128,              // NPerBlock
+    KPerBlock,        // KPerBlock
+    16,               // AK1
+    16,               // BK1
+    32,               // MPerXDL
+    32,               // NPerXDL
+    2,                // MXdlPerWave
+    2,                // NXdlPerWave
+    S<4, 64, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
+    2,                // ABlockTransferSrcVectorDim
+    16,               // ABlockTransferSrcScalarPerVector
+    16,               // ABlockTransferDstScalarPerVector_AK1
+    false,            // ABlockLdsExtraM
+    S<4, 64, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
+    2,                // BBlockTransferSrcVectorDim
+    16,               // BBlockTransferSrcScalarPerVector
+    16,               // BBlockTransferDstScalarPerVector_BK1
+    false,            // BBlockLdsExtraN
+    1,                // CShuffleMXdlPerWavePerShuffle
+    1,                // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>,   // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    8,                // CShuffleBlockTransferScalarPerVector_NPerBlock
+    BlkGemmPSched,    // BlkGemmPipeSched
+    BlkGemmPVer,      // BlkGemmPipelineVer
+    ADataType,        // ComputeTypeA
+    BDataType         // ComputeTypeB
+    >;
 
 int main(int argc, char* argv[])
 {
-    return run_mx_gemm_example<ADataType,
+    return run_mx_gemm_example<DeviceOpInstance,
+                               ADataType,
                                BDataType,
                                XDataType,
                                CDataType,
                                ALayout,
                                BLayout,
                                CLayout,
+                               AElementOp,
+                               BElementOp,
                                CElementOp,
                                AccDataType,
                                CShuffleDataType,
-                               mx_vector_size>(argc, argv)
+                               ScaleBlockSize>(argc, argv)
                ? 0
                : -1;
 }
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index bcb62df625..996a543ecc 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -104,14 +104,24 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
             list(REMOVE_ITEM FILE_NAME "${source}")
         endif()
     endforeach()
+    # Do not build gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94
+    foreach(source IN LISTS FILE_NAME)
+    if(NOT EX_TARGETS MATCHES "gfx94" AND NOT EX_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_multiply_multiply_xdl_fp8_bpreshuffle")
+         message("Skipping ${source} example for current target")
+         list(REMOVE_ITEM FILE_NAME "${source}")
+    endif()
+    endforeach()
     #only continue if there are some source files left on the list
     if(FILE_NAME)
-        if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+        if(FILE_NAME MATCHES "_xdl" AND NOT FILE_NAME MATCHES "_pk_i4")
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         elseif(FILE_NAME MATCHES "_mx") #only build mx example for gfx950
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+        elseif(FILE_NAME MATCHES "_pk_i4") #only build these examples for gfx942 and gfx950
+            message("trimming targets for ${FILE_NAME}")
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         endif()
         set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
         add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -202,9 +212,9 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         endif()
         set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
         add_executable(${EXAMPLE_NAME} ${FILE_NAME})
diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md
index e9806e7a67..12414a20ed 100644
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -126,6 +126,6 @@ Note FA use bottom-right by default to express swa case, here we require you exp
 TBD
 
 ## FP8 experimental support
-As described in [this blog](https://blog.hippoml.com/8bit-hippoattention-up-to-3x-faster-compared-to-flashattentionv2-8f9def90b482), we have an experimental support for fp8 fmha kernels, you can evaluate the performance by setting the arg `-prec=fp8` to the `tile_example_fmha_fwd`, on a gfx940/941/942 machine and ROCm 6.0+.
+As described in [this blog](https://blog.hippoml.com/8bit-hippoattention-up-to-3x-faster-compared-to-flashattentionv2-8f9def90b482), we have an experimental support for fp8 fmha kernels, you can evaluate the performance by setting the arg `-prec=fp8` to the `tile_example_fmha_fwd`, on a gfx942 machine and ROCm 6.0+.
 
 Currently we only support `-vlayout=c`( `hdim*seqlen` for V matrix) and `-squant=1`(static quantization) with `hdim=128` for fp8 now. Full feature support will come later.
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
index 4c23250d05..932f6020b6 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -176,7 +176,8 @@ float fmha_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a)
     );
 }}
 
-float fmha_bwd(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){{
+template <>
+float fmha_bwd<2>(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){{
     float r = -1;
 {F_dispatch}
     return r;
@@ -412,14 +413,26 @@ class FmhaBwdDQDKDVKernel:
         pn = pad_name()
         n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name + f'_{self.F_pipeline}'
         if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
+
         if self.F_bias != 'no' : n += f'_{self.F_bias}'
+        else: n += '_nbias'
+
         if self.F_dbias == 't' : n += '_dbias'
+        else: n += '_ndbias'
+
         if self.F_mask[0:2] == 's_':
             if self.F_mask == 's_mask': n += f'_mask'
+            else: n += '_nmask'
         else:
             if self.F_mask != 'no' : n += f'_m{self.F_mask[0]}'
+            else: n += '_nmask'
+
         if self.F_dropout != 'no' : n += f'_{self.F_dropout}'
+        else: n += '_ndropout'
+
         if self.F_deterministic == 't' : n += '_deterministic'
+        else: n += '_ndeterministic'
         return n
 
     @property
@@ -489,7 +502,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                                 F_spad=spad, F_skpad=skpad, F_dpad=dpad, F_dvpad=dvpad,
                                 F_bias=bias, F_dbias=dbias, F_dropout=dropout, F_mask=mask, F_mode=mode,
                                 F_pipeline=ppl, mask_impl=mask_impl, F_deterministic=deterministic)
-            if kernel_filter != None:
+            if kernel_filter != '':
                 if not fnmatch.fnmatch(k.name, kernel_filter):
                     continue
             # Flash attention integration
@@ -517,23 +530,25 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     if not cond:
                         continue
             # Aiter (mha_bwd) integration
-            elif receipt == 10:
+            elif receipt == 300:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= mode == "batch"
-                    cond &= bias in ['no', 'alibi']
                     cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
                     cond &= dpad == dvpad
-                    cond &= deterministic == "t"
                     if not cond:
                         continue
             # Aiter (mha_varlen_bwd) integration
-            elif receipt == 11:
+            elif receipt == 400:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= mode == "group"
-                    cond &= bias in ['no', 'alibi']
                     cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
                     cond &= dpad == dvpad
-                    cond &= deterministic == "t"
+                    if not cond:
+                        continue
+            # aiter::mha_bwd C++ api integration
+            elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= dpad == dvpad
                     if not cond:
                         continue
             api_pool.register_dq_dk_dv_traits(k.api_trait())
@@ -632,13 +647,14 @@ class FmhaBwdOGradDotOKernel:
         pn = pad_name()
         n = f"fmha_bwd_dot_do_o_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_o{self.F_occupancy}"
         if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
         return n
 
     @property
     def filename(self) -> str:
         return self.name + ".cpp"
 
-def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]:
+def get_bwd_dot_do_o_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaBwdOGradDotOKernel]:
     # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
     #       support this in future
     def get_occupancy(dtype, hdim):
@@ -657,6 +673,26 @@ def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]:
             k = FmhaBwdOGradDotOKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype,
                                 F_spad=spad, F_dvpad=dvpad, F_mode=mode,
                                 F_occupancy=get_occupancy(dtype, hdim))
+            if kernel_filter != '':
+                if not fnmatch.fnmatch(k.name, kernel_filter):
+                    continue
+            # Aiter (mha_bwd) integration
+            if receipt == 300:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "batch"
+                    if not cond:
+                        continue
+            # Aiter (mha_varlen_bwd) integration
+            elif receipt == 400:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "group"
+                    if not cond:
+                        continue
+            # aiter::mha_bwd C++ api integration
+            elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
             gen.append(k)
 
     return gen
@@ -766,14 +802,16 @@ class FmhaBwdConvertQGradKernel:
         pn = pad_name()
         n = f"fmha_bwd_convert_dq_d{self.F_hdim}_{self.F_dtype}_b{self.F_bm0}x{self.F_bn0}_{self.F_mode}_o{self.F_occupancy}"
         if pn != '' : n += f'_{pn}'
-        if self.F_deterministic == 't' : n += f'_deterministic'
+        else: n += '_npad'
+        if self.F_deterministic == 't' : n += '_deterministic'
+        else: n += '_ndeterministic'
         return n
 
     @property
     def filename(self) -> str:
         return self.name + ".cpp"
 
-def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]:
+def get_bwd_convert_dq_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaBwdConvertQGradKernel]:
     # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
     #       support this in future
     def get_occupancy(dtype, hdim):
@@ -792,6 +830,26 @@ def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]:
                 continue
             k = FmhaBwdConvertQGradKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, F_bm0=64, F_bn0=tile.F_bn0,
                                 F_spad=spad, F_dpad=dpad, F_mode=mode, F_occupancy=get_occupancy(dtype, hdim), F_deterministic=deterministic)
+            if kernel_filter != '':
+                if not fnmatch.fnmatch(k.name, kernel_filter):
+                    continue
+            # Aiter (mha_bwd) integration
+            if receipt == 300:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "batch"
+                    if not cond:
+                        continue
+            # Aiter (mha_varlen_bwd) integration
+            elif receipt == 400:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "group"
+                    if not cond:
+                        continue
+            # aiter::mha_bwd C++ api integration
+            elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
             gen.append(k)
 
     return gen
@@ -808,27 +866,37 @@ def write_single_bwd_convert_dq_kernel(kernel: FmhaBwdConvertQGradKernel, autoge
 def write_bwd_api(api_pool : FmhaBwdApiPool, autogen_dir: Path) -> None:
     (autogen_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api)
 
-def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
-    kernels = get_bwd_dot_do_o_blobs()
+def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
+    filter_list = filter_list.split('@')
+    filter_list.extend([''] * (3 - len(filter_list)))
+    # TODO
+    assert optdim_list == [-1]
+
+    kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt)
     for kernel in kernels:
         write_single_bwd_dot_do_o_kernel(kernel, output_dir)
-    kernels = get_bwd_convert_dq_blobs()
+    kernels = get_bwd_convert_dq_blobs(filter_list[1], receipt)
     for kernel in kernels:
         write_single_bwd_convert_dq_kernel(kernel, output_dir)
-    api_pool, kernels = get_bwd_dq_dk_dv_blobs(kernel_filter, receipt, mask_impl)
+    api_pool, kernels = get_bwd_dq_dk_dv_blobs(filter_list[2], receipt, mask_impl)
     for kernel in kernels:
         write_single_bwd_dq_dk_dv_kernel(kernel, output_dir)
     write_bwd_api(api_pool, output_dir)
 
-def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
+    filter_list = filter_list.split('@')
+    filter_list.extend([''] * (3 - len(filter_list)))
+    # TODO
+    assert optdim_list == [-1]
+
     with file_path.open('a') as f:
-        kernels = get_bwd_dot_do_o_blobs()
+        kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        kernels = get_bwd_convert_dq_blobs()
+        kernels = get_bwd_convert_dq_blobs(filter_list[1], receipt)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        _, kernels = get_bwd_dq_dk_dv_blobs(kernel_filter, receipt, mask_impl)
+        _, kernels = get_bwd_dq_dk_dv_blobs(filter_list[2], receipt, mask_impl)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
         f.write(str(file_path.parent / GEN_DIR / FMHA_BWD_API_FILENAME) + "\n")
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index b72627ed5d..c31a0ce954 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -118,7 +118,7 @@ FMHA_FWD_API_PER_DTYPE="""    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
 {F_hdim_case}
     }}
 """
-FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim}) {{
+FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{
 {F_inner_dispatch}
         }}
 """
@@ -233,14 +233,26 @@ class FmhaFwdPipeline:
         pn = pad_name()
         n = f'{self.tag}_v{self.F_vlayout[0]}'
         if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
+
         if self.F_bias != 'no' : n += f'_{self.F_bias}'
+        else: n += '_nbias'
+
         if self.F_mask[0:2] == 's_':
             if self.F_mask == 's_mask': n += f'_mask'
+            else: n += '_nmask'
         else:
             if self.F_mask != 'no' : n += f'_m{self.F_mask[0]}'
+            else: n += '_nmask'
+
         if self.F_lse == 't' : n += '_lse'
+        else: n += '_nlse'
+
         if self.F_dropout == 't' : n += '_dropout'
+        else: n += '_ndropout'
+
         if self.F_squant == 't' : n += '_squant'
+        else: n += '_nsquant'
         return n
 
 class FmhaFwdApiPool:
@@ -276,7 +288,7 @@ class FmhaFwdApiPool:
                                    F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                    F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
             per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
         if not per_dtypes:
@@ -405,6 +417,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
             '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
         ### '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
             '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            '192' : FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
             '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
@@ -416,7 +429,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
     else:
         return None
 
-def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
+def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
     # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
     #       support this in future
     def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]:
@@ -432,6 +445,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                 # if True:
                     pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
                     pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
+                    # the below two is used for hdim vectorize load
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', bias, lse, dropout, squant, mask))
+                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', bias, lse, dropout, squant, mask))
 
                     pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
                     pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
@@ -477,6 +493,10 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                     if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                         # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                         continue
+                if hdim == 192 and tile.F_bn1 == 128:
+                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
+                    if pipeline.F_bias != 'no' or pipeline.F_lse == 't' or pipeline.F_dropout == 't':
+                        continue
                 k = FmhaFwdKernel(F_idx=0,
                                   F_hdim=hdim,
                                   F_dtype=dtype,
@@ -484,9 +504,12 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                                   F_tile=tile,
                                   F_pipeline=pipeline,
                                   mask_impl=mask_impl)
-                if kernel_filter != None:
+                if kernel_filter != '':
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                 # 2 - Flash attention integration
                 if receipt in (2, 3):
                     cond = dtype in ['fp16', 'bf16']
@@ -504,20 +527,25 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                     if not cond:
                         continue
                 # Aiter(mha_fwd) integration
-                elif receipt == 10:
+                elif receipt == 100:
                     cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
+                    cond &= mode == 'batch'
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_bias in ['no', 'alibi']
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
                 # Aiter(mha_varlen_fwd) integration
-                elif receipt == 11:
+                elif receipt == 200:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == 'group'
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # aiter::mha_fwd C++ api integration
+                elif receipt == 600:
                     cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_bias in ['no', 'alibi']
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
@@ -532,15 +560,15 @@ def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
 def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None:
     (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api)
 
-def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
-    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
+def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
+    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
     for kernel in kernels:
         write_single_fwd_kernel(kernel, output_dir)
     write_fwd_api(api_pool, output_dir)
 
-def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
     with file_path.open('a') as f:
-        _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
+        _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
         f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
index f8a89448ba..dc7ef712e2 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -181,7 +181,7 @@ class FmhaFwdAppendKVApiPool:
                                    F_pagedkv=BOOL_MAP[trait.pagedkv], F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                                    F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
             per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
         return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_APPENDKV_API.format(F_dispatch = per_dtypes)
@@ -323,12 +323,11 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                                   F_tile=tile,
                                   F_pipeline=pipeline,
                                   mask_impl=mask_impl)
-                if kernel_filter != None:
+                if kernel_filter != '':
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
                 # 2 - Flash attention integration
-                # 12 - Aiter(mha_fwd_kvcache) integration
-                if receipt in (2, 12):
+                if receipt == 2:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= pipeline.F_vlayout == 'row'
                     if not cond:
@@ -344,13 +343,15 @@ def write_single_kernel(kernel: FmhaFwdAppendKVKernel, autogen_dir: Path) -> Non
 def write_fwd_appendkv_api(api_pool : FmhaFwdAppendKVApiPool, autogen_dir: Path) -> None:
     (autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME).write_text(api_pool.api)
 
-def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None:
+    assert optdim_list == [-1]
     api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
     for kernel in kernels:
         write_single_kernel(kernel, output_dir)
     write_fwd_appendkv_api(api_pool, output_dir)
 
-def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None:
+    assert optdim_list == [-1]
     with file_path.open('a') as f:
         _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
         for kernel in kernels:
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index c0ca666b11..5ad118fd1a 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -91,10 +91,12 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
 using fmha_pipeline = {F_pipeline}<
     fmha_pipeline_problem>;
 
+/// FIXME: use {F_spad}/{F_dvpad} as kPadM/kPadN parameters after solving
+///        store_tile_raw() data corruption issue
 using fmha_epilogue =
     ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
                                            typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
-                                           {F_spad}, {F_dvpad}>>;
+                                           false, false>>;
 
 using fmha_kernel =
     ck_tile::FmhaFwdSplitKVKernel<fmha_pipeline, fmha_epilogue>;
@@ -397,14 +399,26 @@ class FmhaFwdSplitKVPipeline:
         pn = pad_name()
         n = f'{self.tag}_v{self.F_vlayout[0]}'
         if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
+
         if self.F_bias != 'no' : n += f'_{self.F_bias}'
+        else: n += '_nbias'
+
         if self.F_mask[0:2] == 's_':
             if self.F_mask == 's_mask': n += f'_mask'
+            else: n += '_nmask'
         else:
             if self.F_mask != 'no' : n += f'_m{self.F_mask[0]}'
+            else: n += '_nmask'
+
         if self.F_lse == 't' : n += '_lse'
+        else: n += '_nlse'
+
         if self.F_squant == 't' : n += '_squant'
+        else: n += '_nsquant'
+
         if self.F_pagedkv == 't' : n += '_pagedkv'
+        else: n += '_npagedkv'
         return n
 
 @dataclass
@@ -427,8 +441,13 @@ class FmhaFwdSplitKVCombinePipeline:
         pn = pad_name()
         n = f'{self.tag}'
         if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
+
         if self.F_lse == 't' : n += '_lse'
+        else: n += '_nlse'
+
         if self.F_squant == 't' : n += '_squant'
+        else: n += '_nsquant'
         return n
 
 class FmhaFwdSplitKVApiPool:
@@ -464,7 +483,7 @@ class FmhaFwdSplitKVApiPool:
                                    F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                    F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                 if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim, F_inner_dispatch=inners)
             if_i = 'if' if i == 0 else 'else if'
             per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
         if not per_dtypes:
@@ -659,6 +678,12 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
                     pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
 
+                    pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', bias, 't', squant, pagedkv, mask))
+
+                    pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
+                    pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
+
                     pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
                     pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
                 else:
@@ -702,7 +727,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                            F_tile=tile,
                            F_pipeline=pipeline,
                            mask_impl=mask_impl)
-                if kernel_filter != None:
+                if kernel_filter != '':
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
                 # Flash attention integration
@@ -714,20 +739,17 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     if not cond:
                         continue
                 # Aiter(mha_varlen_fwd) integration
-                elif receipt == 11:
+                elif receipt == 200:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= mode == "group"
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_bias in ['no', 'alibi']
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
-                # Aiter(mha_fwd_kvcache) integration
-                elif receipt == 12:
+                # aiter::mha_fwd_splikv C++ api integration
+                elif receipt == 600:
                     cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
                     cond &= pipeline.F_vlayout == 'row'
-                    cond &= pipeline.F_bias in ['no', 'alibi']
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
@@ -780,9 +802,20 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis
                            F_mode=mode,
                            F_tile=tile,
                            F_pipeline=pipeline)
-                if kernel_filter != None:
+                if kernel_filter != '':
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
+                # Aiter(mha_varlen_fwd) integration
+                if receipt == 200:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "group"
+                    if not cond:
+                        continue
+                # aiter::mha_fwd_splikv C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
                 gen.append(k)
 
     return gen
@@ -794,21 +827,29 @@ def write_fwd_splitkv_api(api_pool : FmhaFwdSplitKVApiPool, autogen_dir: Path) -
     file_path = autogen_dir / FMHA_FWD_SPLITKV_API_FILENAME
     file_path.write_text(api_pool.api)
 
-def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
-    kernels = get_fwd_splitkv_combine_blobs(kernel_filter, receipt)
+def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
+    filter_list = filter_list.split('@')
+    filter_list.extend([''] * (2 - len(filter_list)))
+    assert optdim_list == [-1]
+
+    kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt)
     for kernel in kernels:
         write_single_kernel(kernel, output_dir)
-    api_pool, kernels = get_fwd_splitkv_blobs(kernel_filter, receipt, mask_impl)
+    api_pool, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl)
     for kernel in kernels:
         write_single_kernel(kernel, output_dir)
     write_fwd_splitkv_api(api_pool, output_dir)
 
-def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
+    filter_list = filter_list.split('@')
+    filter_list.extend([''] * (2 - len(filter_list)))
+    assert optdim_list == [-1]
+
     with file_path.open('a') as f:
-        kernels = get_fwd_splitkv_combine_blobs(kernel_filter, receipt)
+        kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        _, kernels = get_fwd_splitkv_blobs(kernel_filter, receipt, mask_impl)
+        _, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl)
         for kernel in kernels:
             f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
         f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp
index 6204cbcfa8..9179dbd9be 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -452,4 +452,5 @@ struct fmha_bwd_traits
     bool is_deterministic;
     // TODO: padding check is inside this api
 };
+template <int Version = 2>
 float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index b3855e59df..8f6fb8df54 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -620,7 +620,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
     ck_tile::HostTensor<BiasDataType> bias_host(
         bias.type == bias_enum::elementwise_bias
-            ? get_lengths(i_perm, 1, 1, shape_seqlen_q, shape_seqlen_k)
+            ? get_lengths(i_perm, 1, 1, shape_seqlen_q, max_seqlen_k)
             : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
 
     ck_tile::HostTensor<SaccDataType> alibi_slope_host(
@@ -884,7 +884,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             else
                 return i_perm ? seqlen_knew : nhead_k * seqlen_knew;
         }();
-        const ck_tile::index_t stride_bias    = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
+        const ck_tile::index_t stride_bias    = (i_perm ? max_seqlen_k : 1 * max_seqlen_k);
         const ck_tile::index_t stride_randval = (max_seqlen_k);
         const ck_tile::index_t stride_o_acc   = (hdim_v);
         const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
@@ -909,7 +909,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 return i_perm ? hdim_v * seqlen_knew : seqlen_knew;
         }();
         const ck_tile::index_t nhead_stride_bias =
-            (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k);
+            (i_perm ? 0 * shape_seqlen_q * max_seqlen_k : 0 * max_seqlen_k);
         const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
         const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q;
         const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q);
@@ -925,7 +925,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
             (0 < page_block_size ? (nhead_k * hdim_v * page_block_size)
                                  : (nhead_k * hdim_v * shape_seqlen_k));
         const ck_tile::index_t batch_stride_vnew    = (nhead_k * hdim_v * seqlen_knew);
-        const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
+        const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * max_seqlen_k);
         const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
         const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q);
         const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q);
@@ -1381,9 +1381,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
             ck_tile::HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
             // clang-format off
             if(i_perm)
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2] + key_offset); });
+                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); });
             else
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2] + key_offset); });
+                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); });
             // clang-format on
 
             // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py
index 0c2cef1ce7..c2b0924eb3 100644
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -30,7 +30,7 @@ handlers = dict(
 )
 assert 0 < len(handlers)
 
-def write_blobs(output_dir: Optional[str], api_list : List[str], kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list : List[str], optdim_list : List[int], receipt, mask_impl) -> None:
     if output_dir is None:
         output_dir = Path(__file__).parent
     else:
@@ -38,21 +38,21 @@ def write_blobs(output_dir: Optional[str], api_list : List[str], kernel_filter :
 
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    for api in api_list:
+    for api, kernel_filter in zip(api_list, filters_list):
         handler = handlers[api][HandlerId.WRITE_BLOBS]
-        handler(output_dir, kernel_filter, receipt, mask_impl)
+        handler(output_dir, kernel_filter, receipt, optdim_list, mask_impl)
 
 # list all the files that will be generated
-def list_blobs(output_file : Optional[str], api_list : List[str], kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def list_blobs(output_file : Optional[str], api_list : List[str], filters_list : List[str], optdim_list : List[int], receipt, mask_impl) -> None:
     assert output_file is not None
     file_path = Path(output_file)
 
     # create an empty file / drop its contents if it exists
     open(file_path, "w").close()
 
-    for api in api_list:
+    for api, kernel_filter in zip(api_list, filters_list):
         handler = handlers[api][HandlerId.LIST_BLOBS]
-        handler(file_path, kernel_filter, receipt, mask_impl)
+        handler(file_path, kernel_filter, receipt, optdim_list, mask_impl)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -84,6 +84,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "-f",
         "--filter",
+        default='',
         required=False,
         help="filter out kernels that need to generate, using fnmatch module"
     )
@@ -105,15 +106,31 @@ if __name__ == "__main__":
              "  1: generate more instance to cover all hdim\n"  + \
              "  2: Only generate instance for Flash attention integration\n"  + \
              "  4: Only generate instance for PyTorch integration\n" + \
-             " 10: Only generate instance for Aiter(mha_fwd, mha_bwd) integration\n" + \
-             " 11: Only generate instance for Aiter(mha_varlen_fwd, mha_varlen_bwd) integration\n" + \
-             " 12: Only generate instance for Aiter(mha_fwd_kvcache) integration"
-      
+             "  100-199: Only generate instance for Aiter(mha_fwd) integration\n" + \
+             "  200-299: Only generate instance for Aiter(mha_varlen_fwd) integration\n" + \
+             "  300-399: Only generate instance for Aiter(mha_bwd) integration\n" + \
+             "  400-499: Only generate instance for Aiter(mha_varlen_bwd) integration\n" + \
+             "  600-699: Only generate instance for aiter::mha_fwd && aiter::mha_fwd_splitkv && aiter::mha_bwd C++ api integration"
+    )
+
+    parser.add_argument(
+        "--optdim",
+        default='-1',
+        required=False,
+        help="only optimize the hdim in the list. separated by comma. -1 is the default choice" + \
+              "eg. --optdim=32,64,128,256"
     )
 
     args = parser.parse_args()
     api_list = args.direction.split(',')
+    filter_list = args.filter.split(',')
+    filter_list.extend([''] * (len(api_list) - len(filter_list)))
+    optdim_list = [int(hdim) for hdim in args.optdim.split(',')]
+
+    if len(api_list) > 1:
+        assert optdim_list == [-1]
+
     if args.list_blobs is not None:
-        list_blobs(args.list_blobs, api_list, args.filter, int(args.receipt), mask_impl=args.mask)
+        list_blobs(args.list_blobs, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask)
     else:
-        write_blobs(args.output_dir, api_list, args.filter, int(args.receipt), mask_impl=args.mask)
+        write_blobs(args.output_dir, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask)
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index 700b007fad..0238a125dc 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -564,9 +564,9 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                                   h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, True,   False,   0,    0,    0),
                                   h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
                                   h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, True,    True,   0,    0,    0),
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1,1024, 8,  True,  False, True, True,    True,   0,    0,    0),
                                   h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,    True,   0,    0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, True,    True,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  1, 256, 2,  True,  False, True, True,    True,   0,    0,    0),
                                   h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,    True,   0,    0,    0)]}
         total_blob = list()
         for hs_key in h_trait_dict:
diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt
index 30cfee22f6..411db2e317 100644
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -1,5 +1,9 @@
 add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
 add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
-target_compile_options(tile_example_gemm_universal PRIVATE
-  -mllvm -enable-noalias-to-md-conversion=0
-)
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index 5dc7b9cd0b..69051423fb 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -10,7 +10,7 @@
 #include <tuple>
 
 #include "ck_tile/host.hpp"
-#include "gemm_basic.hpp"
+#include "gemm_utils.hpp"
 
 template <typename ADataType,
           typename BDataType,
@@ -29,8 +29,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
     constexpr int kBlockPerCu = 1;
 
     // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
     constexpr ck_tile::index_t K_Tile = 64;
 
     constexpr ck_tile::index_t M_Warp = 2;
@@ -54,7 +54,9 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
         GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
     using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
     using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-        ck_tile::CShuffleEpilogueProblem<AccDataType,
+        ck_tile::CShuffleEpilogueProblem<ADataType,
+                                         BDataType,
+                                         AccDataType,
                                          CDataType,
                                          CLayout,
                                          CodegenPipelineProblem::kBlockSize,
@@ -99,45 +101,99 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
 
 #include "run_gemm_example.inc"
 
+template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                     "BPrecType is ck_tile::pk_int4_t!");
+        }
+    }
+    else
+    {
+        if(a_layout == "R" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Row{}, Row{}, Row{});
+        }
+        else if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Col{}, Row{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+}
+
 int run_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
     if(!result)
         return -1;
 
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
 
-    if(a_layout == "R" && b_layout == "C")
+    if(data_type == "fp16")
     {
-        if(data_type == "fp16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::half_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else if(data_type == "bf16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf16_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else if(data_type == "fp8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::fp8_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else if(data_type == "bf8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported data_type!");
-        }
+        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, argc, argv);
     }
+    else if(data_type == "bf16")
+    {
+        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "bf8")
+    {
+        return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+    else if(data_type == "pk_int4_t")
+    {
+        // TODO: Add support for bhalf_t ADataType
+        return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+#endif
     else
     {
-        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+        throw std::runtime_error("Unsupported data type for this operation !!!");
     }
 }
 
diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp
similarity index 60%
rename from example/ck_tile/03_gemm/gemm_basic.hpp
rename to example/ck_tile/03_gemm/gemm_utils.hpp
index dbc582e5a3..25fab6bde0 100644
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -35,11 +35,77 @@
 #error "unsupported CK_TILE_PIPELINE_DEFAULT value"
 #endif
 
+struct GemmConfig
+{
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 64;
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    static constexpr bool DoubleSmemBuffer = false;
+#endif
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+    // Compute friendly for Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 32;
+
+    static constexpr bool DoubleSmemBuffer = false;
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer = true;
+#endif
+
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+};
+
 template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
-struct GemmBasicTypeConfig;
+struct GemmTypeConfig;
 
 template <>
-struct GemmBasicTypeConfig<ck_tile::half_t>
+struct GemmTypeConfig<ck_tile::half_t>
 {
     using ADataType   = ck_tile::half_t;
     using BDataType   = ck_tile::half_t;
@@ -49,7 +115,7 @@ struct GemmBasicTypeConfig<ck_tile::half_t>
 };
 
 template <>
-struct GemmBasicTypeConfig<ck_tile::bf16_t>
+struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
 {
     using ADataType   = ck_tile::bf16_t;
     using BDataType   = ck_tile::bf16_t;
@@ -58,7 +124,7 @@ struct GemmBasicTypeConfig<ck_tile::bf16_t>
 };
 
 template <>
-struct GemmBasicTypeConfig<ck_tile::fp8_t>
+struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
 {
     using ADataType   = ck_tile::fp8_t;
     using BDataType   = ck_tile::fp8_t;
@@ -67,7 +133,7 @@ struct GemmBasicTypeConfig<ck_tile::fp8_t>
 };
 
 template <>
-struct GemmBasicTypeConfig<ck_tile::bf8_t>
+struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
 {
     using ADataType   = ck_tile::bf8_t;
     using BDataType   = ck_tile::bf8_t;
@@ -76,7 +142,7 @@ struct GemmBasicTypeConfig<ck_tile::bf8_t>
 };
 
 template <>
-struct GemmBasicTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
+struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
 {
     using ADataType   = ck_tile::half_t;
     using BDataType   = ck_tile::pk_int4_t;
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index f068cbc1da..79ed9ce76b 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -29,8 +29,68 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
     // Use higher threshold
     return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }
-template <typename Tensor>
+
+template <typename Tensor,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 void permute_tensor_b(Tensor& tensor)
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity>;
+
+    using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                       BDataType,
+                                                                       AccDataType,
+                                                                       GemmShape,
+                                                                       GemmUniversalTraits,
+                                                                       GEMM_PIPELINE_SCHEDULER,
+                                                                       true,
+                                                                       ck_tile::TailNumber::Full>;
+
+    using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+
+    const ck_tile::index_t K  = tensor.get_length(0);
+    const ck_tile::index_t N  = tensor.get_length(1);
+    const ck_tile::index_t K1 = GemmPipeline::GetSmemPackB();
+    const ck_tile::index_t K0 = K / K1;
+
+    Tensor tensor_copy = tensor;
+
+    // int K0, N, K1
+    for(int j = 0; j < K0; j++)
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int jj = 0; jj < K1; jj++)
+            {
+                tensor(j * N * K1 + i * K1 + jj) = tensor_copy(i * K + (j * K1 + jj));
+            }
+        }
+    }
+}
+
+template <typename Tensor>
+void permute_vectors_i4x4_b(Tensor& tensor)
 {
     const ck_tile::index_t K = tensor.get_length(0);
     const ck_tile::index_t N = tensor.get_length(1);
@@ -126,13 +186,15 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_byte / 1.E6 / ave_time;
 
-    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
-              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
-              << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name
-              << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits<ADataType>::name
-              << " B Type = " << DataTypeTraits<BDataType>::name
-              << " C Type = " << DataTypeTraits<CDataType>::name << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
+              << " B_Type=" << DataTypeTraits<BDataType>::name
+              << " C_Type=" << DataTypeTraits<CDataType>::name
+              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
+              << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
 
     return ave_time;
 }
@@ -153,7 +215,7 @@ int run_gemm_example_with_layouts(int argc,
     if(!result)
         return -1;
 
-    using AccDataType = typename GemmBasicTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
+    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
 
     ck_tile::index_t M = arg_parser.get_int("m");
     ck_tile::index_t N = arg_parser.get_int("n");
@@ -191,8 +253,8 @@ int run_gemm_example_with_layouts(int argc,
     }
     else if(init_method == 2)
     {
-        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(1)}(a_m_k);
-        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(1)}(b_k_n);
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
     }
     else
     {
@@ -200,22 +262,45 @@ int run_gemm_example_with_layouts(int argc,
         b_k_n.SetZero();
     }
 
+    if(GemmConfig::UseStructuredSparsity)
+    {
+        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
+    }
+
     ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
     ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
     ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
 
-    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    static_assert(!GemmConfig::PermuteA, "Not implemented");
     if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
     {
-        // Permute data for device implementation
+        // Permute vector pk_i4x4 data for device implementation
         ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
-        permute_tensor_b(b_k_n_dev);
+        if constexpr(GemmConfig::PermuteB)
+        {
+            permute_tensor_b<decltype(b_k_n_dev),
+                             ADataType,
+                             BDataType,
+                             AccDataType,
+                             CDataType,
+                             ALayout,
+                             BLayout,
+                             CLayout>(b_k_n_dev);
+        }
+        permute_vectors_i4x4_b(b_k_n_dev);
         b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
     }
     else
     {
+        if constexpr(GemmConfig::PermuteB)
+        {
+            std::cout << "Permute for this DataType is not implemented." << std::endl;
+            return false;
+        }
         b_k_n_dev_buf.ToDevice(b_k_n.data());
     }
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
     c_m_n_dev_buf.SetZero();
     c_m_n_dev_result.SetZero();
 
@@ -317,7 +402,6 @@ int run_gemm_example_with_layouts(int argc,
                                   "Error: Incorrect results!",
                                   rtol_atol.at(ck_tile::number<0>{}),
                                   rtol_atol.at(ck_tile::number<1>{}));
-
         std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
                   << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
                   << std::endl;
diff --git a/example/ck_tile/03_gemm/script/benchmark_basic_bf16.sh b/example/ck_tile/03_gemm/script/benchmark_basic_bf16.sh
old mode 100644
new mode 100755
index e69de29bb2..d7e5d4640a
--- a/example/ck_tile/03_gemm/script/benchmark_basic_bf16.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic_bf16.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
+VALID=1
+
+
+for b_matrix_layout in "C"; do
+    for m in "64" "512" "1024" "2048"; do
+        for n in "512" "1024" "2048"; do
+            for k in "64" "512" "1024" "2048"; do
+                $EXE -prec=bf16 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID
+            done
+        done
+    done
+done
\ No newline at end of file
diff --git a/example/ck_tile/03_gemm/script/benchmark_basic_bf8.sh b/example/ck_tile/03_gemm/script/benchmark_basic_bf8.sh
old mode 100644
new mode 100755
index e69de29bb2..466f6bb4e1
--- a/example/ck_tile/03_gemm/script/benchmark_basic_bf8.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic_bf8.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
+VALID=1
+
+
+for b_matrix_layout in "C"; do
+    for m in "64" "512" "1024" "2048"; do
+        for n in "512" "1024" "2048"; do
+            for k in "64" "512" "1024" "2048"; do
+                $EXE -prec=bf8 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID
+            done
+        done
+    done
+done
\ No newline at end of file
diff --git a/example/ck_tile/03_gemm/script/benchmark_basic.sh b/example/ck_tile/03_gemm/script/benchmark_basic_fp16.sh
similarity index 100%
rename from example/ck_tile/03_gemm/script/benchmark_basic.sh
rename to example/ck_tile/03_gemm/script/benchmark_basic_fp16.sh
diff --git a/example/ck_tile/03_gemm/script/benchmark_basic_fp8.sh b/example/ck_tile/03_gemm/script/benchmark_basic_fp8.sh
old mode 100644
new mode 100755
diff --git a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf16.sh b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf16.sh
old mode 100644
new mode 100755
diff --git a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf8.sh b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf8.sh
old mode 100644
new mode 100755
diff --git a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp16.sh
similarity index 100%
rename from example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh
rename to example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp16.sh
diff --git a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp8.sh b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp8.sh
old mode 100644
new mode 100755
diff --git a/example/ck_tile/03_gemm/script/run_full_test.sh b/example/ck_tile/03_gemm/script/run_full_test.sh
index 45bd1bed61..12ea6f0bf8 100755
--- a/example/ck_tile/03_gemm/script/run_full_test.sh
+++ b/example/ck_tile/03_gemm/script/run_full_test.sh
@@ -32,14 +32,14 @@ function print_log_header(){
 }
 
 # run verification tests
-example/ck_tile/03_gemm/script/smoke_test_basic.sh
+for dtype in fp16 bf16 fp8 bf8; do
+    example/ck_tile/03_gemm/script/benchmark_basic_$dtype.sh
+done
 example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
 
 # run performance benchmarks
-export gemm_basic_log="perf_tile_gemm_basic_fp16_$GPU_arch.log"
-print_log_header $gemm_basic_log $env_type $branch $host_name
-example/ck_tile/03_gemm/script/benchmark_basic.sh 2>&1 | tee -a $gemm_basic_log
-
-export gemm_mem_pipeline_log="perf_tile_gemm_mem_pipeline_fp16_$GPU_arch.log"
-print_log_header $gemm_mem_pipeline_log $env_type $branch $host_name
-example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh 2>&1 | tee -a $gemm_mem_pipeline_log
+for dtype in fp16 bf16 fp8 bf8; do
+    export gemm_log="perf_tile_gemm_mem_pipeline_${dtype}_${GPU_arch}.log"
+    print_log_header $gemm_log $env_type $branch $host_name
+    example/ck_tile/03_gemm/script/benchmark_mem_pipeline_$dtype.sh 2>&1 | tee -a $gemm_log
+done
diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
index ab763437e5..2ba16ca89d 100644
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -10,7 +10,7 @@
 #include <tuple>
 
 #include "ck_tile/host.hpp"
-#include "gemm_basic.hpp"
+#include "gemm_utils.hpp"
 
 template <typename ADataType,
           typename BDataType,
@@ -21,90 +21,40 @@ template <typename ADataType,
           typename CLayout>
 float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
 {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
 
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
-
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr bool TransposeC = false;
-
-    constexpr int kBlockPerCu                         = 1;
-    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    constexpr ck_tile::index_t TileParitionerM01      = 4;
-
-    // ===============================================
-
-    using GemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-    using TilePartitioner = ck_tile::
-        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
-                                                                 kPadN,
-                                                                 kPadK,
-                                                                 DoubleSmemBuffer,
+    using Traits              = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                           GemmConfig::kPadN,
+                                           GemmConfig::kPadK,
+                                           ALayout,
+                                           BLayout,
+                                           CLayout>;
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
                                                                  ALayout,
                                                                  BLayout,
                                                                  CLayout,
-                                                                 TransposeC>;
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity>;
     using GemmPipelineProblem =
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
     using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
 
-    const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
-    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
     const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
     const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
     const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
@@ -127,17 +77,19 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
 
         using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
         using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<AccDataType,
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             AccDataType,
                                              CDataType,
                                              CLayout,
                                              GemmPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
                                              UniversalGemmProblem::TransposeC>>;
         using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
         auto kargs   = Kernel::MakeKernelArgs(args);
@@ -158,8 +110,9 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
                       << std::endl;
         }
 
-        ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        ave_time = ck_tile::launch_kernel(s,
+                                          ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                              Kernel{}, grids, blocks, 0, kargs));
         return ave_time;
     };
 
@@ -171,6 +124,16 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
             Run(ck_tile::bool_constant<true>{},
                 ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
         }
+        else if(tail_num == ck_tile::TailNumber::Odd)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Even)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+        }
         else
         {
             std::ostringstream err;
@@ -255,11 +218,29 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
     }
     else
     {
-        std::ostringstream err;
-        err << "Num K loop must be larger than number of prefetech stages."
-            << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages << "\n File: " << __FILE__
-            << ":" << __LINE__ << ", in function: " << __func__;
-        throw std::runtime_error(err.str());
+        if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Odd)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Even)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        else
+        {
+            std::ostringstream err;
+            err << "Num K loop must be larger than number of prefetech stages."
+                << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
+                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+            throw std::runtime_error(err.str());
+        }
     }
 
     return ave_time;
@@ -267,133 +248,113 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
 
 #include "run_gemm_example.inc"
 
+template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                     "BPrecType is ck_tile::pk_int4_t!");
+        }
+    }
+    else
+    {
+        if(a_layout == "R" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Row{}, Row{}, Row{});
+        }
+        else if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Col{}, Row{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
+                argc, argv, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+}
+
 int run_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
     if(!result)
         return -1;
 
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
 
-    if(a_layout == "R" && b_layout == "R")
+    if(data_type == "fp16")
     {
-        if(data_type == "fp16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::half_t>(argc, argv, Row{}, Row{}, Row{});
-        }
-        else if(data_type == "bf16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf16_t>(argc, argv, Row{}, Row{}, Row{});
-        }
-        else if(data_type == "fp8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::fp8_t>(argc, argv, Row{}, Row{}, Row{});
-        }
-        else if(data_type == "bf8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Row{}, Row{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported data_type!");
-        }
+        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, argc, argv);
     }
-    else if(a_layout == "R" && b_layout == "C")
+    else if(data_type == "bf16")
     {
-        if(data_type == "fp16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::half_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else if(data_type == "bf16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf16_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else if(data_type == "fp8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::fp8_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-        else if(data_type == "bf8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Row{}, Col{}, Row{});
-        }
+        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "bf8")
+    {
+        return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+
 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-        else if(data_type == "pk_int4_t")
-        {
-            // TODO: Add support for bhalf_t ADataType
-            return run_gemm_example_with_layouts<ck_tile::half_t,
-                                                 ck_tile::pk_int4_t,
-                                                 ck_tile::half_t>(argc, argv, Row{}, Col{}, Row{});
-        }
-#endif
-        else
-        {
-            throw std::runtime_error("Unsupported data_type!");
-        }
-    }
-    else if(a_layout == "C" && b_layout == "C")
+    else if(data_type == "pk_int4_t")
     {
-        if(data_type == "fp16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::half_t>(argc, argv, Col{}, Col{}, Row{});
-        }
-        else if(data_type == "bf16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf16_t>(argc, argv, Col{}, Col{}, Row{});
-        }
-        else if(data_type == "fp8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::fp8_t>(argc, argv, Col{}, Col{}, Row{});
-        }
-        else if(data_type == "bf8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Col{}, Col{}, Row{});
-        }
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-        else if(data_type == "pk_int4_t")
-        {
-            // TODO: Add support for bhalf_t ADataType
-            return run_gemm_example_with_layouts<ck_tile::half_t,
-                                                 ck_tile::pk_int4_t,
-                                                 ck_tile::half_t>(argc, argv, Col{}, Col{}, Row{});
-        }
+        // TODO: Add support for bhalf_t ADataType
+        return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
 #endif
-        else
-        {
-            throw std::runtime_error("Unsupported data_type!");
-        }
-    }
-    else if(a_layout == "C" && b_layout == "R")
-    {
-        if(data_type == "fp16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::half_t>(argc, argv, Col{}, Row{}, Row{});
-        }
-        else if(data_type == "bf16")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf16_t>(argc, argv, Col{}, Row{}, Row{});
-        }
-        else if(data_type == "fp8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::fp8_t>(argc, argv, Col{}, Row{}, Row{});
-        }
-        else if(data_type == "bf8")
-        {
-            return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Col{}, Row{}, Row{});
-        }
-        else
-        {
-            throw std::runtime_error("Unsupported data_type!");
-        }
-    }
     else
     {
-        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+        throw std::runtime_error("Unsupported data type for this operation !!!");
     }
 }
 
-int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    try
+    {
+        run_gemm_example(argc, argv);
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Caught runtime error: " << e.what() << '\n';
+        // Return a non-zero code to indicate failure
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
index 48c150009e..25598282e3 100644
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -41,6 +41,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using YDataType           = DataType;
     using GammaDataType       = DataType;
     using InvRmsDataType      = ck_tile::null_type;
+    using UnquantYDataType    = ck_tile::null_type;
     using SmoothScaleDataType = ck_tile::null_type;
     using YScaleDataType      = ck_tile::null_type;
 
@@ -55,6 +56,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
 
+    ck_tile::HostTensor<UnquantYDataType> unquant_y_host_ref({m, n}, {stride, 1});
+
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
     ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
 
@@ -76,6 +79,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using PipelineTraits =
         ck_tile::Rmsnorm2dFwdTraits<true,  // kPadN
                                     false, // kSaveInvRms
+                                    false, // kSaveUnquant
                                     kTwoPass,
                                     ck_tile::Rmsnorm2dFusedAddEnum::NO_ADD,      // fuse add
                                     ck_tile::Rmsnorm2dFusedQuantEnum::NO_SWEEP>; // fuse quant
@@ -85,6 +89,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                                          ComputeDataType,
                                                          YDataType,
                                                          InvRmsDataType,
+                                                         UnquantYDataType,
                                                          SmoothScaleDataType,
                                                          YScaleDataType,
                                                          Shape,
@@ -108,6 +113,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                        nullptr,
                                        nullptr,
                                        nullptr,
+                                       nullptr,
                                        epsilon,
                                        m,
                                        n,
@@ -135,8 +141,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                          GammaDataType,
                                          ComputeDataType,
                                          YDataType,
-                                         InvRmsDataType>(
-            x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon);
+                                         InvRmsDataType,
+                                         UnquantYDataType>(
+            x_host, gamma_host, y_host_ref, invRms_host_ref, unquant_y_host_ref, epsilon);
 
         y_buf.FromDevice(y_host_dev.data());
 
diff --git a/example/ck_tile/10_rmsnorm2d/generate.py b/example/ck_tile/10_rmsnorm2d/generate.py
index dadb2268b2..39d42e5ff1 100644
--- a/example/ck_tile/10_rmsnorm2d/generate.py
+++ b/example/ck_tile/10_rmsnorm2d/generate.py
@@ -54,6 +54,7 @@ template <typename XDataType_,
           typename YDataType_,
           typename SmoothScaleDataType_,
           typename YScaleDataType_,
+          typename UnquantYDataType_,
           ck_tile::index_t Repeat_M_,         // each thread repeat along M
           ck_tile::index_t Repeat_N_,         // each thread repeat along N
           ck_tile::index_t ThreadPerBlock_M_, // num threads along M
@@ -61,6 +62,7 @@ template <typename XDataType_,
           ck_tile::index_t Vector_N_,         // vector size along N
           bool kPadN_,
           bool kSaveInvRms_,
+          bool kSaveUnquant_,
           bool kTwoPass_,
           ck_tile::index_t kFusedAdd_ = 0,
           ck_tile::index_t kFusedQuant_ = 0>
@@ -70,6 +72,7 @@ struct rmsnorm2d_fwd_traits_
     using YDataType           = ck_tile::remove_cvref_t<YDataType_>;
     using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
     using YScaleDataType      = ck_tile::remove_cvref_t<YScaleDataType_>;
+    using UnquantYDataType    = ck_tile::remove_cvref_t<UnquantYDataType_>;
 
     static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
     static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
@@ -120,9 +123,10 @@ struct rmsnorm2d_fwd_traits_
 
     using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
 
-    static constexpr bool kPadN       = kPadN_;
-    static constexpr bool kSaveInvRms = kSaveInvRms_;
-    static constexpr bool kTwoPass    = kTwoPass_;
+    static constexpr bool kPadN        = kPadN_;
+    static constexpr bool kSaveInvRms  = kSaveInvRms_;
+    static constexpr bool kSaveUnquant = kSaveUnquant_;
+    static constexpr bool kTwoPass     = kTwoPass_;
     static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
     static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
 };
@@ -131,6 +135,7 @@ template <typename XDataType_,
           typename YDataType_,
           typename SmoothScaleDataType_,
           typename YScaleDataType_,
+          typename UnquantYDataType_,
           ck_tile::index_t Repeat_M_,         // each thread repeat along M
           ck_tile::index_t Repeat_N_,         // each thread repeat along N
           ck_tile::index_t ThreadPerBlock_M_, // num threads along M
@@ -138,6 +143,7 @@ template <typename XDataType_,
           ck_tile::index_t Vector_N_,         // vector size along N
           bool kPadN_,
           bool kSaveInvRms_,
+          bool kSaveUnquant_,
           bool kTwoPass_,
           int  kFusedAdd_,
           int  kFusedQuant_>
@@ -145,6 +151,7 @@ using traits_ = rmsnorm2d_fwd_traits_<XDataType_,
                                       YDataType_,
                                       SmoothScaleDataType_,
                                       YScaleDataType_,
+                                      UnquantYDataType_,
                                       Repeat_M_,
                                       Repeat_N_,
                                       ThreadPerBlock_M_,
@@ -152,6 +159,7 @@ using traits_ = rmsnorm2d_fwd_traits_<XDataType_,
                                       Vector_N_,
                                       kPadN_,
                                       kSaveInvRms_,
+                                      kSaveUnquant_,
                                       kTwoPass_,
                                       kFusedAdd_,
                                       kFusedQuant_>;
@@ -180,11 +188,13 @@ float rmsnorm2d_fwd_(const S& s, A a)
     using YDataType           = typename Traits_::YDataType;
     using SmoothScaleDataType = typename Traits_::SmoothScaleDataType;
     using YScaleDataType      = typename Traits_::YScaleDataType;
+    using UnquantYDataType    = typename Traits_::UnquantYDataType;
     using ComputeDataType     = typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType;
 
     using PipelineTraits =
         ck_tile::Rmsnorm2dFwdTraits<Traits_::kPadN,
                                     Traits_::kSaveInvRms,
+                                    Traits_::kSaveUnquant,
                                     Traits_::kTwoPass,
                                     static_cast<ck_tile::Rmsnorm2dFusedAddEnum>(Traits_::kFusedAdd),
                                     static_cast<ck_tile::Rmsnorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
@@ -195,6 +205,7 @@ float rmsnorm2d_fwd_(const S& s, A a)
                                              typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType,
                                              typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YDataType,
                                              typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::InvRmsDataType,
+                                             typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::UnquantYDataType,
                                              typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::SmoothScaleDataType,
                                              typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YScaleDataType,
                                              typename Traits_::Shape,
@@ -213,7 +224,16 @@ float rmsnorm2d_fwd_(const S& s, A a)
 
     using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
 
-    using Epilogue = std::conditional_t<Traits_::kFusedQuant != 0, DynamicQuantEpilogue,  Default2DEpilogue>;
+    using Default2DAndDynamicQuantEpilogueProblem = ck_tile::Default2DAndDynamicQuantEpilogueProblem<
+        ComputeDataType, SmoothScaleDataType, YScaleDataType, YDataType, UnquantYDataType, typename Traits_::Shape,
+        ck_tile::Default2DAndDynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, false,  true/*max3*/>>;
+    using Default2DAndDynamicQuantEpilogue = ck_tile::Default2DAndDynamicQuantEpilogue<Default2DAndDynamicQuantEpilogueProblem>;
+
+    using Epilogue = std::conditional_t<Traits_::kFusedQuant != 0,
+                                        std::conditional_t<Traits_::kSaveUnquant,
+                                                           Default2DAndDynamicQuantEpilogue,
+                                                           DynamicQuantEpilogue>,
+                                        Default2DEpilogue>;
 
     using Kernel = ck_tile::Rmsnorm2dFwd<Pipeline, Epilogue>;
 
@@ -355,6 +375,7 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
         F_YDataType : str
         F_SmoothScaleDataType : str
         F_YScaleDataType : str
+        F_UnquantYDataType : str
         F_Repeat_M : int
         F_Repeat_N : int
         F_ThreadPerBlock_M : int
@@ -362,14 +383,15 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
         F_Vector_N : int
         F_kPadN : bool
         F_kSaveInvRms : bool
+        F_kSaveUnquant: bool
         F_kTwoPass : bool
         F_kFusedAdd : int
         F_kFusedQuant : int
 
         @property
         def trait_name(self) ->str:
-            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_SmoothScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
-            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveInvRms):5}'
+            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_SmoothScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {DATA_TYPE_MAP[self.F_UnquantYDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveInvRms):5}, {BOOL_MAP(self.F_kSaveUnquant):5}'
             t_ += f', {BOOL_MAP(self.F_kTwoPass):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
             return t_
 
@@ -390,6 +412,7 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
         F_N : str
         F_add : int
         F_sweep : int
+        F_saveunquant : bool
         instance_list : List[Any] # List[h_traits]
 
         @property
@@ -401,6 +424,8 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
                 nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add]
             if self.F_sweep != 0:
                 nnn = nnn + '_' + FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep]
+            if self.F_saveunquant:
+                nnn = nnn + '_saveunquant'
             return nnn
 
         @property
@@ -451,11 +476,11 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
                         if ins.F_kFusedQuant == 0:
                             _sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant)
                         elif ins.F_kFusedQuant == 1:
-                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sm == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
-                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_SmoothScaleDataType, f_sy_type=ins.F_YScaleDataType)
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sm == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\" && t.save_unquant == {f_suq})'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_SmoothScaleDataType, f_sy_type=ins.F_YScaleDataType, f_suq=BOOL_MAP(ins.F_kSaveUnquant))
                         elif ins.F_kFusedQuant == 2:
-                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
-                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\" && t.save_unquant == {f_suq})'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType, f_suq=BOOL_MAP(ins.F_kSaveUnquant))
                         _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
                                         f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd,
                                         f_sweep_cond = _sweep_cond)
@@ -489,67 +514,72 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
         #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused (smooth) dynamic quant
         fused_add_list = [0, 1]
         fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused (smooth) dynamic quant
+        bool_list = [False, True]
 
-        #                                                       rm  rn  tm   tn  vn  pd     mv     2p     add    sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, False,   0,    0)],
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, False,   0,    0)],
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, False,   0,    0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False,  True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False,  True,   0,    0)]}
+        #                                                              rm  rn  tm   tn  vn  pd     mv     unquant  2p     add    sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  8,  8,  8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 1,  True,  False, False,   False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 1,  True,  False, False,   False,   0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 1,  True,  False, False,   False,   0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  4,  64, 1,  True,  False, False,   False,   0,    0)],
+                        '640' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4,  64, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4, 128, 1,  True,  False, False,   False,   0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  4,  64, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  4,  64, 1,  True,  False, False,   False,   0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  2,  64, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  2,  64, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  2,  64, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 1,  True,  False, False,   False,   0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  2, 128, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 1,  True,  False, False,   False,   0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1, 256, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1, 256, 1,  True,  False, False,   False,   0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 128, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 1,  True,  False, False,   False,   0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1,1024, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,   False,   0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 512, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1,1024, 1,  True,  False, False,   False,   0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 8,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 512, 4,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 2,  True,  False, False,   False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1,1024, 1,  True,  False, False,   False,   0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1,1024, 8,  True,  False, False,    True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,    True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  1, 256, 2,  True,  False, False,    True,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,    True,   0,    0)]}
         total_blob = list()
         for hs_key in h_trait_dict:
             hs = h_trait_dict[hs_key]
             current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
-            for dtype, scale_type, fused_add, fused_quant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list):
+            for dtype, scale_type, fused_add, fused_quant, save_unquant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list, bool_list):
                 prec_i, prec_o = dtype.split(',')
                 scale_sm, scale_y = scale_type.split(',')
                 if prec_o in dynamic_quant_out_dtype and fused_quant != 1 and fused_quant != 2:
                     continue # skip non dynamic quant case
                 if (fused_quant == 1 or fused_quant == 2) and hs_key == 'big':
                     continue
+                if (fused_quant == 0 and save_unquant == True):
+                    continue # save_unquant should always be false when there is no quant enabled
                 current_hs = list()
                 for chs_ in hs:
                     h_ = copy.copy(chs_) # copy the base instance out
@@ -557,12 +587,14 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
                     h_.F_YDataType = prec_o
                     h_.F_SmoothScaleDataType = scale_sm
                     h_.F_YScaleDataType = scale_y
+                    h_.F_UnquantYDataType = prec_i
                     h_.F_kFusedAdd = fused_add
                     h_.F_kFusedQuant = fused_quant
+                    h_.F_kSaveUnquant = save_unquant
                     current_hs.append(h_) # + "\n"
                 #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
                 current_n_str = 'big' if hs_key == 'big' else current_n
-                total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs))
+                total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, save_unquant, current_hs))
         return total_blob
 
     def list_blobs(self) -> None:
diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
index cdee6dfb80..d5be4384ab 100644
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
@@ -38,6 +38,7 @@ auto create_args(int argc, char* argv[])
         .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n")
         .insert("e", "1e-5", "epsilon")
         .insert("save_rms", "0", "save rms(invrms) or not. set to 1 in training case")
+        .insert("save_unquant", "0", "save result before quant")
         .insert("v", "1", "cpu validation or not")
         .insert("kname", "1", "print kernel name or not")
         .insert("prec_i", "fp16", "input precision")
@@ -61,7 +62,8 @@ template <typename InDataType,
           typename OutDataType,
           typename SmoothScaleDataType,
           typename YScaleDataType,
-          bool SaveRms>
+          bool SaveRms,
+          bool SaveUnquant>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
     ck_tile::index_t m = arg_parser.get_int("m");
@@ -113,6 +115,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
         return false;
     }
 
+    if((fused_quant == 0) && SaveUnquant)
+    {
+        std::cout
+            << "save_unquant should be 0 if quant output is not enabled because it is meaningless. "
+            << "Output Y is what wanted." << std::endl;
+        return false;
+    }
+
     using TypeConfig =
         RmsnormTypeConfig<InDataType, OutDataType, SmoothScaleDataType, YScaleDataType>;
 
@@ -124,6 +134,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     using InvRmsDataType =
         std::conditional_t<SaveRms, typename TypeConfig::InvRmsDataType, ck_tile::null_type>;
+    using UnquantYDataType =
+        std::conditional_t<SaveUnquant, typename TypeConfig::UnquantYDataType, ck_tile::null_type>;
 
     using ComputeDataType = typename TypeConfig::ComputeDataType;
 
@@ -143,6 +155,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
 
+    ck_tile::HostTensor<UnquantYDataType> unquant_y_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<UnquantYDataType> unquant_y_host_dev({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<ck_tile::null_type> unquant_y_null({1});
+
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
     ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
     ck_tile::FillUniformDistribution<SmoothScaleDataType>{-1.f, 1.f}(sm_scale_host);
@@ -155,6 +171,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::DeviceMem sm_scale_buf(sm_scale_host_dev.get_element_space_size_in_bytes());
     ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem unquant_y_buf(unquant_y_host_dev.get_element_space_size_in_bytes());
 
     x_buf.ToDevice(x_host.data());
     gamma_buf.ToDevice(gamma_host.data());
@@ -179,7 +196,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
               << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
               << ", yr_stride:" << yr_stride << std::flush;
 
-    rmsnorm2d_fwd_traits traits{prec_i, prec_o, prec_sm, prec_sy, SaveRms, fused_add, fused_quant};
+    rmsnorm2d_fwd_traits traits{
+        prec_i, prec_o, prec_sm, prec_sy, SaveRms, SaveUnquant, fused_add, fused_quant};
 
     rmsnorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
                             fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
@@ -189,6 +207,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             fused_add == 1 ? y_residual_buf.GetDeviceBuffer() : nullptr,
                             fused_quant != 0 ? y_scale_buf.GetDeviceBuffer() : nullptr,
                             nullptr, // p_invRms, unsupported yet
+                            SaveUnquant ? unquant_y_buf.GetDeviceBuffer() : nullptr,
                             epsilon,
                             m,
                             n,
@@ -203,6 +222,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     std::size_t num_byte =
         sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + sizeof(YDataType) * m * n;
     num_byte += SaveRms ? sizeof(InvRmsDataType) * m * n : 0;
+    num_byte += SaveUnquant ? sizeof(UnquantYDataType) * m * n : 0;
     num_byte += fused_add ? sizeof(XResidualDataType) * m * n : 0;
     num_byte += ((fused_quant == 1) || (fused_quant == 2)) ? sizeof(YScaleDataType) * m : 0;
     num_byte += (fused_quant == 1) ? sizeof(SmoothScaleDataType) * n : 0;
@@ -262,21 +282,57 @@ bool run(const ck_tile::ArgParser& arg_parser)
                 }
             };
 
-            ck_tile::reference_rmsnorm2d_fwd<XDataType,
-                                             GammaDataType,
-                                             ComputeDataType,
-                                             YDataType,
-                                             InvRmsDataType>(
-                x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon, dquant_functor);
+            auto default_and_dquant_functor = [&](int m_, auto& o_unquant_, auto& o_, auto& acc_) {
+                const int N = acc_.mDesc.get_lengths()[1];
+                for(int n_ = 0; n_ < N; ++n_)
+                {
+                    o_unquant_(m_, n_) = ck_tile::type_convert<OutDataType>(acc_(m_, n_));
+                }
+
+                dquant_functor(m_, o_, acc_);
+            };
+
+            if constexpr(SaveUnquant)
+            {
+                ck_tile::reference_rmsnorm2d_fwd<XDataType,
+                                                 GammaDataType,
+                                                 ComputeDataType,
+                                                 YDataType,
+                                                 InvRmsDataType,
+                                                 UnquantYDataType>(x_host,
+                                                                   gamma_host,
+                                                                   y_host_ref,
+                                                                   invRms_host_ref,
+                                                                   unquant_y_host_ref,
+                                                                   epsilon,
+                                                                   default_and_dquant_functor);
+            }
+            else
+            {
+                ck_tile::reference_rmsnorm2d_fwd<XDataType,
+                                                 GammaDataType,
+                                                 ComputeDataType,
+                                                 YDataType,
+                                                 InvRmsDataType,
+                                                 UnquantYDataType>(x_host,
+                                                                   gamma_host,
+                                                                   y_host_ref,
+                                                                   invRms_host_ref,
+                                                                   unquant_y_host_ref,
+                                                                   epsilon,
+                                                                   dquant_functor);
+            }
         }
         else
         {
+            assert(SaveUnquant == false);
             ck_tile::reference_rmsnorm2d_fwd<XDataType,
                                              GammaDataType,
                                              ComputeDataType,
                                              YDataType,
-                                             InvRmsDataType>(
-                x_host, gamma_host, y_host_ref, invRms_host_ref, epsilon);
+                                             InvRmsDataType,
+                                             ck_tile::null_type>(
+                x_host, gamma_host, y_host_ref, invRms_host_ref, unquant_y_null, epsilon);
         }
 
         y_buf.FromDevice(y_host_dev.data());
@@ -293,6 +349,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
             pass = ck_tile::check_err(
                 y_host_dev, y_host_ref, std::string("\nOUT Error: Incorrect results!"), rtol, atol);
 
+            if constexpr(SaveUnquant)
+            {
+                pass &= ck_tile::check_err(unquant_y_host_dev,
+                                           unquant_y_host_ref,
+                                           std::string("\n OUT ERROR: Incorrect unquant results!"),
+                                           rtol,
+                                           atol);
+            }
+
             if(fused_add == 1)
             {
                 pass &= ck_tile::check_err(y_residual_host_dev,
@@ -331,6 +396,23 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                                rtol,
                                                atol);
                 }
+
+                if constexpr(SaveUnquant)
+                {
+                    std::vector<UnquantYDataType> unquant_y_host_dev_row(
+                        unquant_y_host_dev.begin() + i_r * y_stride,
+                        unquant_y_host_dev.begin() + i_r * y_stride + n);
+                    std::vector<UnquantYDataType> unquant_y_host_ref_row(
+                        unquant_y_host_ref.begin() + i_r * y_stride,
+                        unquant_y_host_ref.begin() + i_r * y_stride + n);
+                    pass &=
+                        ck_tile::check_err(unquant_y_host_dev_row,
+                                           unquant_y_host_ref_row,
+                                           std::string("\nOUT[") + std::to_string(i_r) +
+                                               std::string("] Error: Incorrect unquant y results!"),
+                                           rtol,
+                                           atol);
+                }
             }
         }
 
@@ -350,6 +432,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
     return pass;
 }
 
+bool is_quant_data_type(const std::string& prec) { return (prec == "int8") || (prec == "fp8"); }
+
 int main(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -373,48 +457,79 @@ int main(int argc, char* argv[])
         prec_sy = "fp32";
     }
 
-    int save_rms = arg_parser.get_int("save_rms");
+    int save_rms    = arg_parser.get_int("save_rms");
+    int fused_quant = arg_parser.get_int("fquant");
+    int save_unquant =
+        arg_parser.get_int("save_unquant") && is_quant_data_type(prec_o) && (fused_quant != 0);
 
     if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" && save_rms)
     {
-        return run<ck_tile::half_t, ck_tile::half_t, float, float, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, true, false>(arg_parser) ? 0
+                                                                                            : -2;
     }
     else if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" &&
             !save_rms)
     {
-        return run<ck_tile::half_t, ck_tile::half_t, float, float, false>(arg_parser) ? 0 : -2;
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, false, false>(arg_parser) ? 0
+                                                                                             : -2;
     }
     else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
             save_rms)
     {
-        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true, false>(arg_parser) ? 0
+                                                                                            : -2;
     }
     else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
             !save_rms)
     {
-        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, false, false>(arg_parser) ? 0
+                                                                                             : -2;
     }
 
     // dynamic quant case, only in inference
     else if(prec_i == "fp16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
-            !save_rms)
+            !save_rms && !save_unquant)
     {
-        return run<ck_tile::half_t, ck_tile::int8_t, float, float, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::half_t, ck_tile::int8_t, float, float, true, false>(arg_parser) ? 0
+                                                                                            : -2;
     }
     else if(prec_i == "bf16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
-            !save_rms)
+            !save_rms && !save_unquant)
     {
-        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, true>(arg_parser) ? 0 : -2;
+        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, true, false>(arg_parser) ? 0
+                                                                                            : -2;
     }
     else if(prec_i == "fp16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
-            !save_rms)
+            !save_rms && !save_unquant)
     {
-        return run<ck_tile::half_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
+        return run<ck_tile::half_t, ck_tile::fp8_t, float, float, false, false>(arg_parser) ? 0
+                                                                                            : -2;
     }
     else if(prec_i == "bf16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
-            !save_rms)
+            !save_rms && !save_unquant)
     {
-        return run<ck_tile::bf16_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
+        return run<ck_tile::bf16_t, ck_tile::fp8_t, float, float, false, false>(arg_parser) ? 0
+                                                                                            : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_rms && save_unquant)
+    {
+        return run<ck_tile::half_t, ck_tile::int8_t, float, float, true, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_rms && save_unquant)
+    {
+        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, true, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_rms && save_unquant)
+    {
+        return run<ck_tile::half_t, ck_tile::fp8_t, float, float, false, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_rms && save_unquant)
+    {
+        return run<ck_tile::bf16_t, ck_tile::fp8_t, float, float, false, true>(arg_parser) ? 0 : -2;
     }
 
     return -3;
diff --git a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
index 566b94442d..bb4a2f5ef4 100644
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
@@ -21,6 +21,7 @@ struct RmsnormTypeConfig<ck_tile::half_t, OutType, SmoothScaleDataType_, YScaleD
     using YDataType           = OutType;
     using GammaDataType       = ck_tile::half_t;
     using InvRmsDataType      = ck_tile::half_t;
+    using UnquantYDataType    = ck_tile::half_t;
     using ComputeDataType     = float;
     using SmoothScaleDataType = SmoothScaleDataType_;
     using YScaleDataType      = YScaleDataType_;
@@ -33,6 +34,7 @@ struct RmsnormTypeConfig<ck_tile::bf16_t, OutType, SmoothScaleDataType_, YScaleD
     using YDataType           = OutType;
     using GammaDataType       = ck_tile::bf16_t;
     using InvRmsDataType      = ck_tile::bf16_t;
+    using UnquantYDataType    = ck_tile::bf16_t;
     using ComputeDataType     = float;
     using SmoothScaleDataType = SmoothScaleDataType_;
     using YScaleDataType      = YScaleDataType_;
@@ -59,6 +61,7 @@ struct rmsnorm2d_fwd_traits
     std::string prec_sy; // y-scale, used for [M*1] output for next layer
 
     bool save_rms;
+    bool save_unquant;
     int fused_add;   // 0:no-add, 1:pre-add-store, 2:pre-add
     int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
 };
diff --git a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
index ab890738b3..2bad7a00ea 100755
--- a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
+++ b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
@@ -1,7 +1,8 @@
 #!/bin/sh
 EXE="$(find . -name tile_rmsnorm2d_fwd -type f | head -n 1)"
 
-for fquant in "" "-fquant=1 -prec_o=int8" "-fquant=2 -prec_o=int8" "-fquant=1 -prec_o=fp8" "-fquant=2 -prec_o=fp8"; do
+for fquant in "" "-fquant=1 -prec_o=int8" "-fquant=2 -prec_o=int8" "-fquant=1 -prec_o=fp8" "-fquant=2 -prec_o=fp8"\
+  "-fquant=1 -prec_o=int8 -save_unquant=1" "-fquant=2 -prec_o=int8 -save_unquant=1" "-fquant=1 -prec_o=fp8 -save_unquant=1" "-fquant=2 -prec_o=fp8 -save_unquant=1"; do
 for pr_i in "fp16" "bf16" ; do
 for fadd in "0" "1"; do
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=99  -n=13
@@ -27,6 +28,14 @@ $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=2734
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=3182
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9   -n=4096
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=8192
+done
+done
+done
+
+# The following cases uses two pass pipeline which doesn't support quant epilogue. 
+for fquant in ""
+for pr_i in "fp16" "bf16" ; do
+for fadd in "0" "1"; do
 $EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
 #$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=17134
 done
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
index c4faa35e33..e59fcaedad 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -74,7 +74,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
     int topk                = args.get_int("k");
     int seed                = args.get_int("seed");
     int unit_size           = args.get_int("unit");
-    int moe_buf_size        = args.get_int("moe_buf_size");
+    int64_t moe_buf_size    = static_cast<int64_t>(args.get_uint64("moe_buf_size"));
     int kname               = args.get_int("kname");
     int warmup              = args.get_int("warmup");
     int repeat              = args.get_int("repeat");
@@ -152,6 +152,13 @@ bool test_moe_sorting(ck_tile::ArgParser args)
     if(local_expert_masking)
         local_expert_masking_dev.ToDevice(local_expert_masking_host.data());
 
+    // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
+    ck_tile::index_t workspace_size = moe_sorting_get_workspace_size(tokens, num_experts);
+    ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
+
+    if(workspace_size != 0)
+        moe_sorting_ws.SetZero(); // note, clear here!!!!
+
     moe_sorting_trait trait{index_prec, weight_prec, local_expert_masking};
 
     moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
@@ -163,24 +170,80 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                           sorted_expert_ids_dev.GetDeviceBuffer(),
                           sorted_id_cnt_dev.GetDeviceBuffer(),
                           moe_buf_size > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                          workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
                           tokens,
                           unit_size,
                           num_experts,
                           topk,
-                          static_cast<ck_tile::index_t>(moe_buf_size * sizeof(float))};
+                          static_cast<ck_tile::long_index_t>(moe_buf_size * sizeof(float))};
 
     ck_tile::stream_config sc{nullptr,
                               true,
                               /* log_level = */ (kname ? 1 : 0),
                               warmup,
                               repeat};
+
     auto ms = moe_sorting(trait, karg, sc);
-    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, ",
+    // auto ms = moe_sorting_mp(trait, karg, sc);
+
+#if 0
+    {
+        ck_tile::HostTensor<char> ws_host({workspace_size}, {1});
+        moe_sorting_ws.FromDevice(ws_host.data());
+
+        int * p_mesh = reinterpret_cast<int*>(ws_host.data());
+        ck_tile::index_t row_size = ck_tile::impl::moe_sorting_mp_mesh_stride(tokens);
+
+        std::cout << "topk_ids:" << std::endl;
+
+        int * p_topk_ids = reinterpret_cast<int*>(topk_ids_host.data());
+        for(int i_token = 0; i_token < tokens; i_token++) {
+            printf("[t:%2d]", i_token);
+            for(int i_topk = 0; i_topk < topk; i_topk++) {
+                printf("%d, ",p_topk_ids[i_token * topk + i_topk] );
+            }
+            printf("\n");
+        }
+        printf("----------------\n");
+
+        std::vector<int> l_cumsum (num_experts + 1, 0);
+        for(int i_expert = 0; i_expert < num_experts; i_expert++  ) {
+            printf("[e:%2d]", i_expert);
+            int e_cnt = 0;
+            for(int i_token = 0; i_token < tokens; i_token++) {
+                auto v_mesh = p_mesh[i_expert * row_size + i_token];
+                e_cnt += v_mesh != 0 ? 1 : 0;
+                printf("%d, ", v_mesh); 
+            }
+            int e_cnt_unit = (e_cnt + unit_size - 1) / unit_size;
+            printf("[%d/%d]", e_cnt, e_cnt_unit);
+            printf("\n");
+            l_cumsum[i_expert + 1] = l_cumsum[i_expert] + e_cnt_unit;
+        }
+
+        printf("----------------\n");
+        printf("cumsum:\n");
+        for(int i_cc= 0; i_cc < num_experts + 1; i_cc++) {
+            printf("%2d, ", l_cumsum[i_cc]);
+        }
+        printf("\n");
+        printf("----------------\n");
+
+        int * p_cumsum = p_mesh + ck_tile::impl::moe_sorting_mp_mesh_elem(tokens, num_experts);
+        for(int i_expert = 0; i_expert < num_experts + 1; i_expert++  ) {
+            printf("%2d(%d), ",p_cumsum[i_expert], p_cumsum[i_expert] / unit_size);
+        }
+        printf("\n");
+    }
+#endif
+
+    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, mp:%d, ",
            index_prec.c_str(),
            weight_prec.c_str(),
            tokens,
            num_experts,
-           topk);
+           topk,
+           workspace_size != 0 ? 1 : 0);
 
     if(local_expert_masking)
     {
@@ -224,28 +287,41 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                                                               num_experts,
                                                               unit_size,
                                                               local_expert_masking);
-        rtn &= ck_tile::check_err(
-            sorted_ids_host, sorted_ids_ref, std::string("OUT Error: Incorrect ids!"), 1e-6, 1e-6);
-        rtn &= ck_tile::check_err(sorted_weights_host,
-                                  sorted_weights_ref,
-                                  std::string("OUT Error: Incorrect w!"),
-                                  1e-6,
-                                  1e-6);
-        rtn &= ck_tile::check_err(sorted_expert_ids_host,
-                                  sorted_expert_ids_ref,
-                                  std::string("OUT Error: Incorrect eid!"),
-                                  1e-6,
-                                  1e-6);
+        printf("total_tokens_post_pad:%d(%d), ",
+               ref_total_tokens_post_pad,
+               sorted_id_cnt_host.mData[0]);
+        if(ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0])
+        {
+            size_t slen = ref_total_tokens_post_pad;
+            rtn &= ck_tile::check_err(sorted_ids_host.slice({0}, {slen}),
+                                      sorted_ids_ref.slice({0}, {slen}),
+                                      std::string("OUT Error: Incorrect ids!"),
+                                      1e-6,
+                                      1e-6);
+            rtn &= ck_tile::check_err(sorted_weights_host.slice({0}, {slen}),
+                                      sorted_weights_ref.slice({0}, {slen}),
+                                      std::string("OUT Error: Incorrect w!"),
+                                      1e-6,
+                                      1e-6);
+            rtn &= ck_tile::check_err(sorted_expert_ids_host.slice({0}, {slen / unit_size}),
+                                      sorted_expert_ids_ref.slice({0}, {slen / unit_size}),
+                                      std::string("OUT Error: Incorrect eid!"),
+                                      1e-6,
+                                      1e-6);
+        }
+        else
+        {
+            printf("(token size not equal!!)");
+            rtn = false;
+        }
+
         if(moe_buf_size)
         {
             ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
             rtn &= ck_tile::check_err(
                 moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
         }
-        rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
-        printf("total_tokens_post_pad:%d(%d), ",
-               ref_total_tokens_post_pad,
-               sorted_id_cnt_host.mData[0]);
+        // rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
     }
 
     printf("valid:%s", rtn ? "y" : "n");
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
index abff24a669..109ec1b157 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -153,18 +153,106 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         }
         }
 #else
-        using index_t            = ck_tile::index_t;
-        using ms_weight_type     = float;
-        auto [r_, c_]            = ck_tile::moe_sorting_get_smem_row_col(a.tokens, a.num_experts);
-        auto sub_token_          = r_ - 2;
-        r_                       = (r_ - 2) / 8;
-        bool is_sub_token_onshot = a.tokens <= sub_token_;
+        if(moe_sorting_get_workspace_size(a.tokens, a.num_experts) != 0)
+        {
+            return moe_sorting_mp(t, a, s);
+        }
+        using index_t                = ck_tile::index_t;
+        using ms_weight_type         = float;
+        auto sub_token_              = ck_tile::moe_sorting_get_sub_token(a.tokens, a.num_experts);
+        auto row_                    = sub_token_ / 8;
+        bool is_sub_token_onshot     = a.tokens <= sub_token_;
         bool is_local_expert_masking = t.local_expert_masking;
-        (void)c_;
 
-        MOE_SORTING_DISPATCH_EMASK_(r_);
+        MOE_SORTING_DISPATCH_EMASK_(row_);
         // MOE_SORTING_DISPATCH_ETILE(0, 0);
 #endif
     }
     return -1;
 }
+
+#define MOE_SORTING_MP_0(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+#define MOE_SORTING_MP_1(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+#define MOE_SORTING_MP_2(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+#define MOE_SORTING_MP_3(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
+{
+    if(t.weight_type == "fp32" && t.index_type == "int32")
+    {
+        using ms_index_t     = ck_tile::index_t;
+        using ms_weight_type = float;
+
+        if(t.local_expert_masking)
+        {
+            float ave_time = ck_tile::launch_kernel(s,
+                                                    MOE_SORTING_MP_0(1, true),
+                                                    MOE_SORTING_MP_1(1, true),
+                                                    MOE_SORTING_MP_2(1, true),
+                                                    MOE_SORTING_MP_3(1, true));
+            return ave_time;
+        }
+        else
+        {
+            float ave_time = ck_tile::launch_kernel(s,
+                                                    MOE_SORTING_MP_0(1, false),
+                                                    MOE_SORTING_MP_1(1, false),
+                                                    MOE_SORTING_MP_2(1, false),
+                                                    MOE_SORTING_MP_3(1, false));
+            return ave_time;
+        }
+    }
+    return -1;
+}
+
+int moe_sorting_get_workspace_size(int tokens, int num_experts)
+{
+    return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts);
+}
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
index 5bda4d368a..b47ae9013b 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
@@ -18,4 +18,10 @@ struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
 {
 };
 
+// use below API before call moe_sorting() to indicate if need workspace or not
+// if return non zero, means need workspace, you need to allocate a GPU buffer
+// and set to moe_sorting_args.p_ws
+// NOTE: workspace size are required to clear zero before use the API
+int moe_sorting_get_workspace_size(int tokens, int num_experts);
 float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
+float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
diff --git a/example/ck_tile/15_fused_moe/fused_moe.hpp b/example/ck_tile/15_fused_moe/fused_moe.hpp
index 1f2246fa4a..b354d1d347 100644
--- a/example/ck_tile/15_fused_moe/fused_moe.hpp
+++ b/example/ck_tile/15_fused_moe/fused_moe.hpp
@@ -17,6 +17,9 @@ struct fused_moe_args
     const void* y_smooth_scale_ptr;    // [e, 1, n], smooth-quant-scale for 2nd gemm input
     const void* local_expert_mask_ptr; // [e], local_expert_mask_ptr for EP
     void* o_ptr;                       // [m, k], output token (no need to do zeroing)
+    void* ws_ptr;                      // size is moe_sorting_get_workspace_size()
+                                       // if return zero, then could be nullptr
+                                       // must be cleard before use
 
     const void* topk_ids_ptr;    // [tokens, topk]
     const void* topk_weight_ptr; // [tokens, topk]
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
index cf9ff2edba..f887d57aa9 100644
--- a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
@@ -19,19 +19,21 @@ float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_conf
 
     auto t0 = fused_moesorting_trait{"int32", "fp32", t.local_expert_masking};
     auto a0 = fused_moesorting_args{
-        a.topk_ids_ptr,                              // const void* p_topk_ids;
-        a.topk_weight_ptr,                           // const void* p_weights;
-        a.local_expert_mask_ptr,                     // const void* p_local_expert_mask;
-        a.sorted_token_ids_ptr,                      // void* p_sorted_token_ids;
-        a.sorted_weight_ptr,                         // void* p_sorted_weights;
-        a.sorted_expert_ids_ptr,                     // void* p_sorted_expert_ids;
-        a.num_sorted_tiles_ptr,                      // void* p_total_tokens_post_pad;
-        a.o_ptr,                                     // void* p_moe_buf;
-        a.num_tokens,                                // index_t tokens;
-        a.block_m,                                   // index_t unit_size;
-        a.num_experts,                               // index_t num_experts;
-        a.topk,                                      // index_t topk;
-        a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes;
+        a.topk_ids_ptr,          // const void* p_topk_ids;
+        a.topk_weight_ptr,       // const void* p_weights;
+        a.local_expert_mask_ptr, // const void* p_local_expert_mask;
+        a.sorted_token_ids_ptr,  // void* p_sorted_token_ids;
+        a.sorted_weight_ptr,     // void* p_sorted_weights;
+        a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids;
+        a.num_sorted_tiles_ptr,  // void* p_total_tokens_post_pad;
+        a.o_ptr,                 // void* p_moe_buf;
+        a.ws_ptr,                // void* p_ws;
+        a.num_tokens,            // index_t tokens;
+        a.block_m,               // index_t unit_size;
+        a.num_experts,           // index_t num_experts;
+        a.topk,                  // index_t topk;
+        static_cast<ck_tile::long_index_t>(a.num_tokens) * a.stride_token *
+            o_data_bytes // index_t moe_buf_bytes;
     };
 
     auto t1 = fused_moegemm_traits{t.prec_i,
diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp
index 95adcd684b..cb93ce8907 100644
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -371,6 +371,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
         ck_tile::DeviceMem num_sorted_tiles_buf(
             num_sorted_tiles_host.get_element_space_size_in_bytes());
 
+        // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
+        ck_tile::index_t workspace_size = ck_tile::moe_sorting_get_workspace_size(tokens, experts);
+        ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
+        if(workspace_size != 0)
+            moe_sorting_ws.SetZero(); // note, clear here!!!!
+
         fused_moe_traits traits{prec_i,
                                 prec_w,
                                 prec_o,
@@ -394,6 +400,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                             local_expert_masking ? local_expert_mask_buf.GetDeviceBuffer()
                                                  : nullptr,
                             o_buf.GetDeviceBuffer(),
+                            workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
                             topk_ids_buf.GetDeviceBuffer(),
                             topk_weight_buf.GetDeviceBuffer(),
                             sorted_token_ids_buf.GetDeviceBuffer(),
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
index 286fe4201d..a0cd18ec74 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -18,16 +18,42 @@
 template <typename ALayout, typename BLayout, typename CLayout>
 float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s)
 {
-    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu = 1;
-
-    // This part comes from the Codegen
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+    // Memory friendly for Interwave scheduler
     constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 32;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 4;
+    constexpr ck_tile::index_t N_Warp = 1;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    constexpr bool DoubleSmemBuffer = false;
+#endif
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+    // Compute friendly for Intrawave scheduler
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr bool DoubleSmemBuffer = false;
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
     constexpr ck_tile::index_t K_Tile = 32;
 
     constexpr ck_tile::index_t M_Warp = 2;
@@ -36,61 +62,232 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
 
     constexpr ck_tile::index_t M_Warp_Tile = 32;
     constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
 
-    using CodegenGemmShape =
+    constexpr bool DoubleSmemBuffer = true;
+#endif
+
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
+    constexpr bool TransposeC = false;
+
+    constexpr int kBlockPerCu                         = 1;
+    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    using GemmShape =
         ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
                                ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using TilePartitioner = ck_tile::
+        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
 
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                 kPadN,
+                                                                 kPadK,
+                                                                 DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 TransposeC>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
-    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using CodegenPipelineProblem = ck_tile::
-        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-    using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-        ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                         CDataType,
-                                         CLayout,
-                                         CodegenPipelineProblem::kBlockSize,
-                                         TilePartitioner::MPerBlock,
-                                         TilePartitioner::NPerBlock,
-                                         M_Warp,
-                                         N_Warp,
-                                         M_Warp_Tile,
-                                         N_Warp_Tile,
-                                         K_Warp_Tile,
-                                         CodegenPipelineProblem::TransposeC>>;
-    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
-    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
-    using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
 
-    auto kargs = Kernel::MakeKernelArgs(args);
+    const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
 
-    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
-    constexpr dim3 blocks = Kernel::BlockSize();
+    float ave_time{0};
 
-    if(!Kernel::IsSupportedArgument(kargs))
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        constexpr auto scheduler      = GEMM_PIPELINE_SCHEDULER;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             CLayout,
+                                             GemmPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC>>;
+        using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << GemmPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        return ave_time;
+    };
+
+    if(has_hot_loop)
     {
-        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-    }
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+        if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Odd)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Even)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+        }
+        else
+        {
+            std::ostringstream err;
+            err << "Incorrect tail_num for compv3 pipeline! Expected Full, Odd or Even, but got "
+                << tail_num << "\nPrefetchStages: " << BaseGemmPipeline::PrefetchStages
+                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+            throw std::runtime_error(err.str());
+        }
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+        // Tail pipeline One to Seven
+        if(tail_num == ck_tile::TailNumber::One)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
 
-    if(s.log_level_ > 0)
+        if constexpr(BaseGemmPipeline::PrefetchStages > 2)
+        {
+            if(tail_num == ck_tile::TailNumber::Two)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 3)
+        {
+            if(tail_num == ck_tile::TailNumber::Three)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 4)
+        {
+            if(tail_num == ck_tile::TailNumber::Four)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 5)
+        {
+            if(tail_num == ck_tile::TailNumber::Five)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 6)
+        {
+            if(tail_num == ck_tile::TailNumber::Six)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 7)
+        {
+            if(tail_num == ck_tile::TailNumber::Seven)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
+            }
+        }
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+        if(tail_num == ck_tile::TailNumber::Three)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+        }
+        else
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+        }
+#endif
+    }
+    else
     {
-        std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                  << "shape: " << CodegenGemmShape::GetName() << '\n'
-                  << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                  << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                  << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                  << std::endl;
+        if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Odd)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Even)
+        {
+            Run(ck_tile::bool_constant<false>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        std::ostringstream err;
+        err << "Incorrect tail_num for pipeline without hotloop, expected Full, Odd or Even, but "
+               "got "
+            << tail_num << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
+            << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
     }
 
-    float ave_time = ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
     return ave_time;
 }
 
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
index 7b7e22160a..0999c7ad3b 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -9,6 +9,30 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#endif
+
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#else
+#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+#endif
+
 template <typename DataType>
 struct BatchedGemmTypeConfig;
 
@@ -32,19 +56,19 @@ using CDataType   = Types::CDataType;
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "256", "m dimension")
-        .insert("n", "128", "n dimension")
-        .insert("k", "128", "k dimension")
+    arg_parser.insert("m", "512", "m dimension")
+        .insert("n", "1024", "n dimension")
+        .insert("k", "2048", "k dimension")
         .insert("stride_a", "0", "Tensor A stride")
         .insert("stride_b", "0", "Tensor B stride")
         .insert("stride_c", "0", "Tensor C stride")
         .insert("a_layout", "R", "A tensor data layout - Row by default")
         .insert("b_layout", "C", "B tensor data layout - Row by default")
         .insert("c_layout", "R", "C tensor data layout - Row by default")
-        .insert("batch_stride_a", "32768", "Batch A stride")
-        .insert("batch_stride_b", "16384", "Batch B stride")
-        .insert("batch_stride_c", "32768", "Batch C stride")
-        .insert("batch_count", "16", "Batch count")
+        .insert("batch_stride_a", "1048576", "Batch A stride")
+        .insert("batch_stride_b", "2097152", "Batch B stride")
+        .insert("batch_stride_c", "524288", "Batch C stride")
+        .insert("batch_count", "8", "Batch count")
         .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
         .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
index 1105304e3e..16a31e519a 100644
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -185,7 +185,6 @@ int run_batched_gemm_example_with_layouts(int argc,
                                                    kbatch,
                                                    n_warmup,
                                                    n_repeat);
-
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
     bool pass = true;
 
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
index 03d5818179..2a9903362d 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -16,85 +16,9 @@
 #include "ck_tile/host.hpp"
 #include "grouped_gemm.hpp"
 
-namespace {
-
-struct GroupedGemmKernelParam
-{
-    static const bool kPadM = false;
-    static const bool kPadN = false;
-    static const bool kPadK = false;
-
-    static const int kBlockPerCu         = 1;
-    static const ck_tile::index_t M_Tile = 128;
-    static const ck_tile::index_t N_Tile = 128;
-    static const ck_tile::index_t K_Tile = 32;
-
-    static const ck_tile::index_t M_Warp = 2;
-    static const ck_tile::index_t N_Warp = 2;
-    static const ck_tile::index_t K_Warp = 1;
-
-    static const ck_tile::index_t M_Warp_Tile = 32;
-    static const ck_tile::index_t N_Warp_Tile = 32;
-    static const ck_tile::index_t K_Warp_Tile = 8;
-};
-
-using CodegenGemmShape =
-    ck_tile::TileGemmShape<ck_tile::sequence<GroupedGemmKernelParam::M_Tile,
-                                             GroupedGemmKernelParam::N_Tile,
-                                             GroupedGemmKernelParam::K_Tile>,
-                           ck_tile::sequence<GroupedGemmKernelParam::M_Warp,
-                                             GroupedGemmKernelParam::N_Warp,
-                                             GroupedGemmKernelParam::K_Warp>,
-                           ck_tile::sequence<GroupedGemmKernelParam::M_Warp_Tile,
-                                             GroupedGemmKernelParam::N_Warp_Tile,
-                                             GroupedGemmKernelParam::K_Warp_Tile>>;
-
-using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemmKernelParam::kPadM,
-                                                  GroupedGemmKernelParam::kPadN,
-                                                  GroupedGemmKernelParam::kPadK,
-                                                  ALayout,
-                                                  BLayout,
-                                                  CLayout>;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-using CodegenPipelineProblem =
-    ck_tile::GemmPipelineProblem<ADataType,
-                                 BDataType,
-                                 AccDataType,
-                                 CodegenGemmShape,
-                                 CodegenGemmTraits<ALayout, BLayout, CLayout>>;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-using CodegenGemmPipeline =
-    ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-using GemmEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-    AccDataType,
-    CDataType,
-    CLayout,
-    CodegenPipelineProblem<ALayout, BLayout, CLayout>::kBlockSize,
-    TilePartitioner::MPerBlock,
-    TilePartitioner::NPerBlock,
-    GroupedGemmKernelParam::M_Warp,
-    GroupedGemmKernelParam::N_Warp,
-    GroupedGemmKernelParam::M_Warp_Tile,
-    GroupedGemmKernelParam::N_Warp_Tile,
-    GroupedGemmKernelParam::K_Warp_Tile,
-    CodegenPipelineProblem<ALayout, BLayout, CLayout>::TransposeC>>;
-
-template <typename ALayout, typename BLayout, typename CLayout>
-using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
-                                          CodegenGemmPipeline<ALayout, BLayout, CLayout>,
-                                          GemmEpilogue<ALayout, BLayout, CLayout>>;
-}; // namespace
-
 std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
 {
-    return ::Kernel<std::nullptr_t, std::nullptr_t, std::nullptr_t>::GetWorkSpaceSize(gemm_descs);
+    return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg);
 }
 
 template <typename ALayout, typename BLayout, typename CLayout>
@@ -102,37 +26,265 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                    const ck_tile::stream_config& s,
                    void* p_workspace_)
 {
-    using GroupedGemmKernel = ::Kernel<ALayout, BLayout, CLayout>;
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+    // Memory friendly for Interwave scheduler
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 32;
+    constexpr ck_tile::index_t K_Tile = 64;
 
-    auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs);
+    constexpr ck_tile::index_t M_Warp = 4;
+    constexpr ck_tile::index_t N_Warp = 1;
+    constexpr ck_tile::index_t K_Warp = 1;
 
-    const dim3 grids      = GroupedGemmKernel::GridSize(gemm_descs);
-    constexpr dim3 blocks = GroupedGemmKernel::BlockSize();
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
 
-    ck_tile::hip_check_error(hipMemcpyWithStream(
-        p_workspace_,
-        arguments.data(),
-        arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg),
-        hipMemcpyHostToDevice,
-        s.stream_id_));
+    constexpr bool DoubleSmemBuffer = false;
+#endif
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+    // Compute friendly for Intrawave scheduler
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
+    constexpr ck_tile::index_t K_Tile = 64;
 
-    if(s.log_level_ > 0)
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr bool DoubleSmemBuffer = false;
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr bool DoubleSmemBuffer = true;
+#endif
+
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
+    constexpr bool TransposeC = false;
+
+    constexpr int kBlockPerCu                         = 1;
+    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    using GemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using TilePartitioner = ck_tile::
+        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                 kPadN,
+                                                                 kPadK,
+                                                                 DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 TransposeC>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = gemm_descs[0].k_batch * K_Tile;
+    const ck_tile::index_t K_split     = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        constexpr auto scheduler      = GEMM_PIPELINE_SCHEDULER;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             CLayout,
+                                             GemmPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC>>;
+        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKargs(gemm_descs);
+
+        const dim3 grids      = Kernel::GridSize(gemm_descs);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        ck_tile::hip_check_error(hipMemcpyWithStream(p_workspace_,
+                                                     kargs.data(),
+                                                     get_workspace_size(gemm_descs),
+                                                     hipMemcpyHostToDevice,
+                                                     s.stream_id_));
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:"
+                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        ave_time = ck_tile::launch_kernel(
+            s,
+            ck_tile::make_kernel<blocks.x, kBlockPerCu>(
+                Kernel{},
+                grids,
+                blocks,
+                0,
+                ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
+                gemm_descs.size()));
+        return ave_time;
+    };
+
+    if(has_hot_loop)
     {
-        std::cout << "Launching kernel: " << GroupedGemmKernel::GetName() << " with args:"
-                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                  << std::endl;
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+        if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Odd)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Even)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+        }
+        else
+        {
+            std::ostringstream err;
+            err << "Incorrect tail_num for compv3 pipeline! Expected Full, Odd or Even, but got "
+                << tail_num << "\nPrefetchStages: " << BaseGemmPipeline::PrefetchStages
+                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+            throw std::runtime_error(err.str());
+        }
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+        // Tail pipeline One to Seven
+        if(tail_num == ck_tile::TailNumber::One)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
+        }
+        else if(tail_num == ck_tile::TailNumber::Full)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+        }
+
+        if constexpr(BaseGemmPipeline::PrefetchStages > 2)
+        {
+            if(tail_num == ck_tile::TailNumber::Two)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 3)
+        {
+            if(tail_num == ck_tile::TailNumber::Three)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 4)
+        {
+            if(tail_num == ck_tile::TailNumber::Four)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 5)
+        {
+            if(tail_num == ck_tile::TailNumber::Five)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 6)
+        {
+            if(tail_num == ck_tile::TailNumber::Six)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
+            }
+        }
+        if constexpr(BaseGemmPipeline::PrefetchStages > 7)
+        {
+            if(tail_num == ck_tile::TailNumber::Seven)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
+            }
+        }
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+        if(tail_num == ck_tile::TailNumber::Three)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+        }
+        else
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+        }
+#endif
+    }
+    else
+    {
+        std::ostringstream err;
+        err << "Incorrect tail_num for pipeline without hotloop, expected Full, Odd or Even, but "
+            << "got " << tail_num << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
+            << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
     }
 
-    float ave_time =
-        ck_tile::launch_kernel(s,
-                               ck_tile::make_kernel<blocks.x, GroupedGemmKernelParam::kBlockPerCu>(
-                                   GroupedGemmKernel{},
-                                   grids,
-                                   blocks,
-                                   0,
-                                   ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
-                                   gemm_descs.size()));
     return ave_time;
 }
 
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
index 2ffef95196..4fec329c2f 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -9,11 +9,35 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
 
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#endif
+
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#else
+#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+#endif
+
 template <typename DataType>
-struct GemmBasicTypeConfig;
+struct GemmTypeConfig;
 
 template <>
-struct GemmBasicTypeConfig<ck_tile::half_t>
+struct GemmTypeConfig<ck_tile::half_t>
 {
     using ADataType   = ck_tile::half_t;
     using BDataType   = ck_tile::half_t;
@@ -21,7 +45,7 @@ struct GemmBasicTypeConfig<ck_tile::half_t>
     using AccDataType = float;
 };
 
-using Types = GemmBasicTypeConfig<ck_tile::half_t>;
+using Types = GemmTypeConfig<ck_tile::half_t>;
 
 // Specific type aliases for easy access
 using ADataType   = Types::ADataType;
@@ -29,7 +53,7 @@ using BDataType   = Types::BDataType;
 using AccDataType = Types::AccDataType;
 using CDataType   = Types::CDataType;
 
-using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
+using grouped_gemm_kargs = ck_tile::GemmHostArgs;
 
 auto create_args(int argc, char* argv[])
 {
@@ -46,7 +70,7 @@ auto create_args(int argc, char* argv[])
         .insert("validate", "1", "0. No validation, 1. Validation on CPU.")
         .insert("warmup", "10", "number of iterations before benchmark the kernel.")
         .insert("repeat", "100", "number of iterations to benchmark the kernel.")
-        .insert("group_count", "16", "group count.");
+        .insert("group_count", "8", "group count.");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
index 080ea818c9..f068510d26 100644
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -101,8 +101,8 @@ int run_grouped_gemm_example_with_layouts(int argc,
         for(int i = 0; i < group_count; i++)
         {
             Ms.push_back(256 + 256 * i);
-            Ns.push_back(128 + 128 * i);
-            Ks.push_back(128 + 64 * i);
+            Ns.push_back(256 + 512 * i);
+            Ks.push_back(256 + 64 * i);
 
             stride_As.push_back(Ks[i]);
             stride_Bs.push_back(Ks[i]);
@@ -169,7 +169,10 @@ int run_grouped_gemm_example_with_layouts(int argc,
         const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
         void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();
 
-        gemm_descs.push_back({p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+        // TODO Add support for kbatch > 1 in grouped gemm
+        static constexpr ck_tile::index_t k_batch = 1;
+        gemm_descs.push_back(
+            {p_a, p_b, p_c, k_batch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
     }
 
     invoke_gemm<ALayout, BLayout, CLayout>(warmup, repeat, group_count, gemm_descs);
diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt
new file mode 100644
index 0000000000..9fbe65e3a7
--- /dev/null
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
+
+set(EXAMPLE_FLATMM_COMPILE_OPTIONS)
+# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter)
+# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-local-typedef)
+target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
diff --git a/example/ck_tile/18_flatmm/README.md b/example/ck_tile/18_flatmm/README.md
new file mode 100644
index 0000000000..beaac785fc
--- /dev/null
+++ b/example/ck_tile/18_flatmm/README.md
@@ -0,0 +1,35 @@
+# FLATMM Matrix Multiplication
+
+This folder contains example for FLATMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile FLATMM, but creates the placeholders for the future support on different FLATMM pipeline and different FLATMM modules. In the near future, we will gradually migrate all the FLATMM features from old CK to CK Tile.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the flatmm calculation
+make tile_example_flatmm_basic -j
+```
+This will result in an executable `build/bin/tile_example_flatmm_basic`
+
+## example
+```
+args:
+          -b    batch size (default:1)
+          -m    m dimension (default:1024)
+          -n    n dimension (default:2048)
+          -k    k dimension (default:64)
+   -a_layout    Tensor A data layout (default: R)
+   -b_layout    Tensor B data layout (default: R)
+   -c_layout    Tensor C data layout (default: R)
+   -stride_a    Tensor A stride (default:0)
+   -stride_b    Tensor B stride (default:0)
+   -stride_c    Tensor C stride (default:0)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -e    Absolute error tolerance (default:1e-5)
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+     -warmup    number of iterations before benchmark the kernel (default:10)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+```
diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp
new file mode 100644
index 0000000000..05d0c73b7e
--- /dev/null
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "flatmm_basic.hpp"
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s)
+{
+    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
+    constexpr int kBlockPerCu = 2;
+
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 1;
+    constexpr ck_tile::index_t N_Warp = 4;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    using CodegenFlatmmShape =
+        ck_tile::TileFlatmmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                 ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                 ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenFlatmmShape>;
+
+    using CodegenGemmTraits =
+        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+    using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<ADataType,
+                                                                BDataType,
+                                                                AccDataType,
+                                                                CodegenFlatmmShape,
+                                                                CodegenGemmTraits>;
+    using GemmEpilogue           = ck_tile::CShuffleEpilogue<
+        ck_tile::CShuffleEpilogueProblem<ADataType,
+                                         BDataType,
+                                         AccDataType,
+                                         CDataType,
+                                         CLayout,
+                                         CodegenPipelineProblem::kBlockSize,
+                                         TilePartitioner::MPerBlock,
+                                         TilePartitioner::NPerBlock,
+                                         M_Warp,
+                                         N_Warp,
+                                         M_Warp_Tile,
+                                         N_Warp_Tile,
+                                         K_Warp_Tile,
+                                         CodegenPipelineProblem::TransposeC>>;
+
+    using CodegenFlatmmPolicy = ck_tile::UniversalFlatmmPipelineAgBgCrPolicy;
+    using CodegenFlatmmPipeline =
+        ck_tile::FlatmmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenFlatmmPolicy>;
+
+    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+    using Kernel = ck_tile::FlatmmKernel<TilePartitioner, CodegenFlatmmPipeline, GemmEpilogue>;
+
+    auto kargs = Kernel::MakeKernelArgs(args);
+
+    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+    constexpr dim3 blocks = Kernel::BlockSize();
+
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+    }
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Launching kernel with args:"
+                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
+    }
+
+    float ave_time = ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+    return ave_time;
+}
+
+#include "run_flatmm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_flatmm_example(argc, argv); }
diff --git a/example/ck_tile/18_flatmm/flatmm_basic.hpp b/example/ck_tile/18_flatmm/flatmm_basic.hpp
new file mode 100644
index 0000000000..355ac45ebe
--- /dev/null
+++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp
@@ -0,0 +1,100 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/flatmm.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE 1
+#define CK_TILE_PIPELINE_MEMORY 2
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE
+#endif
+
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#else
+#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+#endif
+
+template <typename DataType>
+struct GemmBasicTypeConfig;
+
+template <>
+struct GemmBasicTypeConfig<ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+    // ToDo: Add more bias config to support different categories of GEMM.
+};
+
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+using Types = GemmBasicTypeConfig<ck_tile::half_t>;
+
+// Specific type aliases for easy access
+using ADataType   = Types::ADataType;
+using BDataType   = Types::BDataType;
+using AccDataType = Types::AccDataType;
+using CDataType   = Types::CDataType;
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "256", "m dimension")
+        .insert("n", "256", "n dimension")
+        .insert("k", "128", "k dimension")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "C", "B tensor data layout - Row by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc
new file mode 100644
index 0000000000..864d888074
--- /dev/null
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+// mfma_type, 0:32x32, 1:16x16
+template <typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma_type = 0)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_ = t.get_lengths()[1];
+    int k_ = t.get_lengths()[0];
+
+    if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 0)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 32, 32, k_ / 16, 2, 8});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    else if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 1)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 16, 16, k_ / 32, 4, 8});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 32, 32, k_ / 32, 2, 16});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 16, 16, k_ / 64, 4, 16});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    return t;
+}
+
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
+                    ck_tile::DeviceMem& b_shuffle_dev_buf,
+                    ck_tile::DeviceMem& c_dev_buf,
+                    ck_tile::index_t M,
+                    ck_tile::index_t N,
+                    ck_tile::index_t K,
+                    ck_tile::index_t stride_A,
+                    ck_tile::index_t stride_B,
+                    ck_tile::index_t stride_C,
+                    ck_tile::index_t kbatch,
+                    int n_warmup,
+                    int n_repeat)
+{
+    ck_tile::FlatmmHostArgs args;
+    args.a_ptr         = a_dev_buf.GetDeviceBuffer();
+    args.b_shuffle_ptr = b_shuffle_dev_buf.GetDeviceBuffer();
+    args.c_ptr         = c_dev_buf.GetDeviceBuffer();
+
+    args.k_batch  = kbatch;
+    args.M        = M;
+    args.N        = N;
+    args.K        = K;
+    args.stride_A = stride_A;
+    args.stride_B = stride_B;
+    args.stride_C = stride_C;
+
+    float ave_time = flatmm_calc<ALayout, BLayout, CLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Flatmm kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+int run_flatmm_example_with_layouts(int argc,
+                                    char* argv[],
+                                    const ALayout a_layout                  = ALayout{},
+                                    const BLayout b_layout                  = BLayout{},
+                                    [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
+    int n_warmup            = arg_parser.get_int("warmup");
+    int n_repeat            = arg_parser.get_int("repeat");
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_host(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_origin_host(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_rslt_host(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    // TODO: add different init types
+    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+    ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
+
+    ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_dev_buf(c_rslt_host.get_element_space_size_in_bytes());
+
+    a_dev_buf.ToDevice(a_host.data());
+    c_rslt_host.SetZero();
+
+    // do pre-shuffle
+    std::string mfma                              = arg_parser.get_str("prec");
+    ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b(b_origin_host, mfma, 0);
+    ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
+    b_shuffle_dev_buf.ToDevice(b_shuffle_host.data());
+
+    invoke_flatmm<ALayout, BLayout, CLayout>(a_dev_buf,
+                                             b_shuffle_dev_buf,
+                                             c_dev_buf,
+                                             M,
+                                             N,
+                                             K,
+                                             stride_A,
+                                             stride_B,
+                                             stride_C,
+                                             kbatch,
+                                             n_warmup,
+                                             n_repeat);
+
+    c_dev_buf.FromDevice(c_rslt_host.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_ref_host(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        c_ref_host.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_host, b_origin_host, c_ref_host);
+        const float max_accumulated_value =
+            *std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end());
+        const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value);
+        pass                 = ck_tile::check_err(c_rslt_host,
+                                  c_ref_host,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::DeviceMem b_origin_dev_buf(b_origin_host.get_element_space_size_in_bytes());
+        b_origin_dev_buf.ToDevice(b_origin_host.data());
+
+        ck_tile::HostTensor<CDataType> c_gpu_ref_host(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        ck_tile::DeviceMem c_gpu_ref_dev_buf(c_gpu_ref_host.get_element_space_size_in_bytes());
+        c_gpu_ref_host.SetZero();
+        c_gpu_ref_dev_buf.SetZero();
+
+        ADataType* d_A;
+        BDataType* d_B;
+        CDataType* d_C;
+
+        ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType)));
+
+        ck_tile::hip_check_error(hipMemcpy(
+            d_A, a_dev_buf.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice));
+        ck_tile::hip_check_error(hipMemcpy(d_B,
+                                           b_origin_dev_buf.GetDeviceBuffer(),
+                                           N * K * sizeof(BDataType),
+                                           hipMemcpyHostToDevice));
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        ck_tile::hip_check_error(hipMemcpy(c_gpu_ref_dev_buf.GetDeviceBuffer(),
+                                           d_C,
+                                           M * N * sizeof(CDataType),
+                                           hipMemcpyDeviceToHost));
+
+        ck_tile::hip_check_error(hipFree(d_A));
+        ck_tile::hip_check_error(hipFree(d_B));
+        ck_tile::hip_check_error(hipFree(d_C));
+
+        c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data());
+        const float max_accumulated_value =
+            *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end());
+        const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value);
+        pass                 = ck_tile::check_err(c_rslt_host,
+                                  c_gpu_ref_host,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
+
+int run_flatmm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    std::string a_layout = arg_parser.get_str("a_layout");
+    std::string b_layout = arg_parser.get_str("b_layout");
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
diff --git a/example/ck_tile/18_flatmm/script/smoke_test_basic.sh b/example/ck_tile/18_flatmm/script/smoke_test_basic.sh
new file mode 100755
index 0000000000..a3fc61cc31
--- /dev/null
+++ b/example/ck_tile/18_flatmm/script/smoke_test_basic.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+EXE="$(find . -name tile_example_flatmm_basic -type f | head -n 1)"
+KNAME=1
+
+export CK_WARMUP=0
+export CK_REPEAT=1
+
+COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
+
+run_tests() {
+    for m in 128 1024; do
+        for n in 128 2048; do
+            for k in 128 4096; do
+
+                $EXE -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -prec=$1 $COMMON_ARGS
+                if [ $? -eq 0 ]; then
+                    echo "Success: Test with m=$m, n=$n, k=$k executed successfully."
+                else
+                    echo "Error: Test with m=$m, n=$n, k=$k failed to execute properly."
+                    # Optionally, exit or break if you need to halt further execution
+                    # exit 1
+                fi
+
+            done
+        done
+    done
+}
+
+set -x
+
+run_tests "bf16"
+run_tests "fp16"
+
+set +x
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index 7f4ba2ed35..88efe0d8d9 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -17,4 +17,5 @@ add_subdirectory(14_moe_smoothquant)
 add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
 add_subdirectory(17_grouped_gemm)
+add_subdirectory(18_flatmm)
 add_subdirectory(35_batched_transpose)
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index c8d1c20f4c..e38f166c1a 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -6,15 +6,10 @@
 #include "ck/config.h"
 
 #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
-#include "ck/utility/env.hpp"
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif
-
-// environment variable to enable logging:
-// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
-CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #endif
 // to do: add various levels of logging with CK_LOG_LEVEL
 
@@ -55,11 +50,10 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #endif
 
 // define general macros for various architectures
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-    defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
 #define __gfx9__
 #endif
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__)
 #define __gfx94__
 #endif
 #if defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__)
@@ -71,7 +65,8 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #define __gfx103__
 #endif
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
-    defined(__gfx1103__) || defined(__gfx11_generic__)
+    defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \
+    defined(__gfx1152__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
 #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
@@ -130,7 +125,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx9__) // for GPU code
+#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
@@ -246,6 +241,18 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
 
+// workaround: compiler issue on gfx950
+#define CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION 1
+
+// workaround: compiler issue on gfx950
+#define CK_TEMP_DISABLE_FP4_TESTS 1
+
+// workaround: compiler issue on gfx950
+#define CK_WORKAROUND_FP16_TO_FP8_CONVERSION 1
+
+// workaround: compiler issue on gfx950
+#define CK_WORKAROUND_BF16_TO_FP8_CONVERSION 1
+
 // denorm test fix, necessary for gfx90a
 #ifndef CK_GFX90A_DENORM_WORKAROUND
 #define CK_GFX90A_DENORM_WORKAROUND 0
diff --git a/include/ck/config.h.in b/include/ck/config.h.in
index 994e60025d..306a6c2ff1 100644
--- a/include/ck/config.h.in
+++ b/include/ck/config.h.in
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -115,6 +115,10 @@
 #cmakedefine CK_USE_WMMA @CK_USE_WMMA@
 #endif
 
+#ifndef CK_USE_WMMA_FP8
+#cmakedefine CK_USE_WMMA_FP8 @CK_USE_WMMA_FP8@
+#endif
+
 #ifndef CK_USE_GFX94
 #cmakedefine CK_USE_GFX94 @CK_USE_GFX94@
 #endif
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index e04e27b761..5439bbe1f0 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -5,11 +5,17 @@
 
 #ifndef __HIPCC_RTC__
 #include <string>
-#include <map>
+#include <string_view>
 #include <hip/hip_runtime.h>
 
 namespace ck {
 
+constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u)
+{
+    return str.empty() ? h
+                       : fnv1a_hash(str.substr(1),
+                                    (h ^ static_cast<unsigned char>(str.front())) * 16777619u);
+}
 inline std::string get_device_name()
 {
     hipDeviceProp_t props{};
@@ -19,58 +25,49 @@ inline std::string get_device_name()
     {
         return std::string();
     }
-
     status = hipGetDeviceProperties(&props, device);
     if(status != hipSuccess)
     {
         return std::string();
     }
     const std::string raw_name(props.gcnArchName);
-
-    // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
-    static std::map<std::string, std::string> device_name_map = {
-        {"Ellesmere", "gfx803"},
-        {"Baffin", "gfx803"},
-        {"RacerX", "gfx803"},
-        {"Polaris10", "gfx803"},
-        {"Polaris11", "gfx803"},
-        {"Tonga", "gfx803"},
-        {"Fiji", "gfx803"},
-        {"gfx800", "gfx803"},
-        {"gfx802", "gfx803"},
-        {"gfx804", "gfx803"},
-        {"Vega10", "gfx900"},
-        {"gfx901", "gfx900"},
-        {"10.3.0 Sienna_Cichlid 18", "gfx1030"},
-    };
-
     const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
-
-    auto match = device_name_map.find(name);
-    if(match != device_name_map.end())
-        return match->second;
-    return name;
+    switch(fnv1a_hash(name))
+    {
+    // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
+    case fnv1a_hash("Ellesmere"):
+    case fnv1a_hash("Baffin"):
+    case fnv1a_hash("RacerX"):
+    case fnv1a_hash("Polaris10"):
+    case fnv1a_hash("Polaris11"):
+    case fnv1a_hash("Tonga"):
+    case fnv1a_hash("Fiji"):
+    case fnv1a_hash("gfx800"):
+    case fnv1a_hash("gfx802"):
+    case fnv1a_hash("gfx804"): return "gfx803";
+    case fnv1a_hash("Vega10"):
+    case fnv1a_hash("gfx901"): return "gfx900";
+    case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030";
+    default: return name;
+    }
 }
 
 inline bool is_xdl_supported()
 {
     return ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-           ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
            ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950";
 }
 
 inline bool is_lds_direct_load_supported()
 {
     // Check if direct loads from global memory to LDS are supported.
-    return ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940" ||
-           ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942" ||
+    return ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx942" ||
            ck::get_device_name() == "gfx950";
 }
 
 inline bool is_bf16_atomic_supported()
 {
-    return ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
-           ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950";
+    return ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950";
 }
 
 inline bool is_gfx101_supported()
@@ -89,7 +86,9 @@ inline bool is_gfx103_supported()
 inline bool is_gfx11_supported()
 {
     return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
-           ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";
+           ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103" ||
+           ck::get_device_name() == "gfx1150" || ck::get_device_name() == "gfx1151" ||
+           ck::get_device_name() == "gfx1152";
 }
 
 inline bool is_gfx12_supported()
diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp
index 918fb28ea9..08b3aba2b3 100644
--- a/include/ck/host_utility/flush_cache.hpp
+++ b/include/ck/host_utility/flush_cache.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 #include "ck/utility/flush_icache.hpp"
diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp
index 5c1c1c4e60..11a1c9bbc0 100644
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -6,6 +6,7 @@
 #include <hip/hip_runtime.h>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 
diff --git a/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp
index 3336041354..35625d142e 100644
--- a/include/ck/library/utility/fill.hpp
+++ b/include/ck/library/utility/fill.hpp
@@ -94,7 +94,7 @@ struct FillMonotonicSeq
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::generate(first, last, [=, n = init_value_]() mutable {
+        std::generate(first, last, [=, *this, n = init_value_]() mutable {
             auto tmp = n;
             n += step_;
             return tmp;
@@ -150,7 +150,7 @@ struct TransformIntoStructuralSparsity
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::for_each(first, last, [=, idx = 0](T& elem) mutable {
+        std::for_each(first, last, [=, *this, idx = 0](T& elem) mutable {
             auto tmp_idx = idx;
             idx += 1;
             return elem *= valid_sequences[tmp_idx % (sizeof(valid_sequences) / sizeof(T))];
diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp
index 250ebd6721..71417ce7bf 100644
--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -6,6 +6,7 @@
 #include <algorithm>
 #include <cassert>
 #include <iostream>
+#include <fstream>
 #include <numeric>
 #include <thread>
 #include <utility>
@@ -252,7 +253,7 @@ struct ParallelTensorFunctor
             std::size_t iw_begin = it * work_per_thread;
             std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);
 
-            auto f = [=] {
+            auto f = [=, *this] {
                 for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                 {
                     call_f_unpack_args(mF, GetNdIndices(iw));
@@ -323,7 +324,32 @@ struct Tensor
     explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
     {
     }
+    void savetxt(std::string file_name, std::string dtype = "float")
+    {
+        std::ofstream file(file_name);
 
+        if(file.is_open())
+        {
+            for(auto& itm : mData)
+            {
+                if(dtype == "float")
+                    file << ck::type_convert<float>(itm) << std::endl;
+                else if(dtype == "int")
+                    file << ck::type_convert<int>(itm) << std::endl;
+                else
+                    // TODO: we didn't implement operator<< for all custom
+                    // data types, here fall back to float in case compile error
+                    file << ck::type_convert<float>(itm) << std::endl;
+            }
+            file.close();
+        }
+        else
+        {
+            // Print an error message to the standard error
+            // stream if the file cannot be opened.
+            throw std::runtime_error(std::string("unable to open file:") + file_name);
+        }
+    }
     decltype(auto) GetLengths() const { return mDesc.GetLengths(); }
 
     decltype(auto) GetStrides() const { return mDesc.GetStrides(); }
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
new file mode 100644
index 0000000000..ebe075b55d
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack,
+          bool TransposeC = false>
+struct BlockwiseGemmXdlops_mx_pipeline_base
+{
+    using ComputeTypeA = ADataType;
+    using ComputeTypeB = BDataType;
+    using AccType      = float; // for now only support V_MFMA_SCALE_F32
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    // Hardcode to 64, as HIP-provided "warpSize" would return 32 on RDNA GPUs.
+    static constexpr index_t WaveSize = 64;
+
+    static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
+    static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
+    static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);
+
+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<ComputeTypeA, MPerXDL, NPerXDL, KPack, ComputeTypeB, TransposeC, true>{};
+
+    static constexpr index_t AMmaKStride = KPack;
+    static constexpr index_t BMmaKStride = KPack;
+
+    //> store rows/cols into thread registers in chunks of 16
+    //> e.g. [k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47]
+    static constexpr index_t KThreadChunk = 16;
+
+    static constexpr index_t KPerThread    = KPerBlock / xdlops_gemm.K0PerXdlops;
+    static constexpr index_t KRepeat       = KPerThread / KPack;
+    static constexpr index_t KPerInnerLoop = KPack;
+
+    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
+
+    using HotLoopInstList =
+        ck::BlockwiseGemmXdlops_pipeline_hotloop_inst<BlockSize,
+                                                      MPerBlock,
+                                                      NPerBlock,
+                                                      KPerBlock,
+                                                      ABlockTransferSrcScalarPerVector,
+                                                      BBlockTransferSrcScalarPerVector,
+                                                      A_K1,
+                                                      B_K1,
+                                                      A_K1,
+                                                      B_K1,
+                                                      MRepeat,
+                                                      NRepeat,
+                                                      MPerXDL,
+                                                      NPerXDL,
+                                                      xdlops_gemm.KPerXdlops>;
+
+    static_assert(KPerThread % KPack == 0,
+                  "Wrong KPack setting; try increasing KPerThread or decreasing KPack");
+
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                              AccType,
+                              MRepeat * NRepeat,
+                              xdlops_gemm.GetRegSizePerXdlops(),
+                              true>
+        c_thread_buf_;
+
+    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+
+    __device__ static auto GetWaveIdx()
+    {
+        const index_t thread_id = ThisThreadBlock::GetThreadId();
+
+        constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
+    }
+
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KThreadChunk * xdlops_a_idx[I0]);
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], KThreadChunk * xdlops_b_idx[I0]);
+    }
+
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
+    __device__ static auto
+        CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
+
+        constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        constexpr auto nrepeat_nwave_nperxdl_to_n_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex(
+            make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
+        const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex(
+            make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
+
+        return make_tuple(c_thread_m, c_thread_n);
+    }
+
+    using Tuple4 = decltype(CalculateAThreadOriginDataIndex());
+
+    /**
+     * @brief Constructor for BlockwiseGemmXdlops_mx_pipeline_base.
+     *
+     * This constructor initializes the thread copy objects for matrices A and B.
+     * It also performs several compile-time checks to ensure the correctness of the
+     * matrix tile descriptors.
+     *
+     * @param a_origin The origin data index for matrix A.
+     * @param b_origin The origin data index for matrix B.
+     *
+     * @note The constructor includes static assertions to ensure that:
+     * - The matrix tile descriptors for A and B are known at compile-time.
+     * - The number of threads in the thread block matches the product of MWaves, NWaves, and
+     * WaveSize.
+     * - The dimensions of the block are divisible by the product of the corresponding XDL and
+     * repeat dimensions.
+     */
+    __host__ __device__
+    BlockwiseGemmXdlops_mx_pipeline_base(Tuple4 a_origin = CalculateAThreadOriginDataIndex(),
+                                         Tuple4 b_origin = CalculateBThreadOriginDataIndex())
+        : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
+    {
+        static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
+                      "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
+
+        static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0,
+                      "wrong!");
+    }
+
+    // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl'
+    __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
+    {
+        constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths();
+
+        constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0];
+        constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1];
+        constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2];
+        constexpr auto N  = c_m0_m1_m2_n_tblk_lens[I3];
+
+        return make_naive_tensor_descriptor_packed(
+            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, I1, I1, N, M0, M1, M2));
+    }
+
+    // XDL output supporting C_xdl = A_xdl * B_xdl
+    __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
+    {
+        constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths();
+
+        constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0];
+        constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1];
+        constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2];
+        constexpr auto N  = c_m0_m1_m2_n_tblk_lens[I3];
+
+        return make_naive_tensor_descriptor_packed(
+            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, I1, I1, M0, M1, M2, N));
+    }
+
+    __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
+    {
+        constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths();
+
+        constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0];
+        constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1];
+        constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2];
+        constexpr auto N  = c_m0_m1_m2_n_tblk_lens[I3];
+
+        return make_naive_tensor_descriptor_packed(
+            make_tuple(I1, Number<MRepeat>{}, Number<NRepeat>{}, I1, I1, M0, M1, M2, N));
+    }
+
+    // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl'
+    __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
+    {
+        constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<NRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<NWaves>{},
+                                                           Number<MPerXDL>{},
+                                                           Number<NPerXDL>{}));
+
+        return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(c_block_desc_m0_n0_m1_n1_m2_n2);
+    }
+
+    // XDL output supporting C_xdl = A_xdl * B_xdl
+    __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
+    {
+        constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<NRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<NWaves>{},
+                                                           Number<MPerXDL>{},
+                                                           Number<NPerXDL>{}));
+
+        return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2);
+    }
+
+    __host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
+    {
+        constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 =
+            make_naive_tensor_descriptor_packed(make_tuple(I1,
+                                                           Number<MRepeat>{},
+                                                           Number<NRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<NWaves>{},
+                                                           Number<MPerXDL>{},
+                                                           Number<NPerXDL>{}));
+
+        return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
+            c_block_desc_g_m0_n0_m1_n1_m2_n2);
+    }
+
+    template <typename CGridDesc_M_N>
+    __host__ __device__ static constexpr auto
+    MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        const auto M = c_grid_desc_m_n.GetLength(I0);
+        const auto N = c_grid_desc_m_n.GetLength(I1);
+
+        const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)),
+                       make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}));
+
+        return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2);
+    }
+
+    template <typename CGridDesc_G_M_N>
+    __host__ __device__ static constexpr auto
+    MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N& c_grid_desc_g_m_n)
+    {
+        const auto G = c_grid_desc_g_m_n.GetLength(I0);
+        const auto M = c_grid_desc_g_m_n.GetLength(I1);
+        const auto N = c_grid_desc_g_m_n.GetLength(I2);
+
+        const auto c_grid_desc_g_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor(
+            c_grid_desc_g_m_n,
+            make_tuple(make_pass_through_transform(G),
+                       make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)),
+                       make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 3, 5>{}, Sequence<2, 4, 6>{}));
+
+        return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
+            c_grid_desc_g_m0_n0_m1_n1_m2_n2);
+    }
+
+    static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k;
+    static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k;
+
+    protected:
+    // M1, N1 as double buffer index
+    // Read buffer + Compute buffer
+    // A[M0, M1, M2, KPack]
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor(
+        make_tuple(Number<MRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}),
+        make_tuple(
+            Number<KPack>{}, Number<KRepeat * MRepeat * KPack>{}, Number<MRepeat * KPack>{}, I1));
+
+    // B[N0, N1, N2, KPack]
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}),
+        make_tuple(
+            Number<KPack>{}, Number<KRepeat * NRepeat * KPack>{}, Number<NRepeat * KPack>{}, I1));
+
+    // C[M, N, NumRegXdlops]
+    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeTypeA,
+                                                         decltype(a_block_desc_m0_m1_m2_k),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, KThreadChunk>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
+                                                         ComputeTypeB,
+                                                         decltype(b_block_desc_n0_n1_n2_k),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<1, 1, 1, KThreadChunk>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_;
+    BThreadCopy b_thread_copy_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
new file mode 100644
index 0000000000..2fdabc6bc7
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp"
+
+namespace ck {
+
+template <BlockGemmPipelineVersion BlkGemmPipelineVer,
+          BlockGemmPipelineScheduler BlkGemmPipeSche,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB,
+          typename AccDataType,
+          typename AWmmaTileDesc,
+          typename BWmmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+constexpr auto BlockGemmPipeline_Selector()
+{
+    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+    {
+        return BlockwiseGemmWmmaops_pipeline_v3<BlkGemmPipeSche,
+                                                BlockSize,
+                                                ADataType,
+                                                BDataType,
+                                                ComputeTypeA,
+                                                ComputeTypeB,
+                                                AccDataType,
+                                                AWmmaTileDesc,
+                                                BWmmaTileDesc,
+                                                ABlockTransferSrcScalarPerVector,
+                                                BBlockTransferSrcScalarPerVector,
+                                                MPerBlock,
+                                                NPerBlock,
+                                                KPerBlock,
+                                                MPerWmma,
+                                                NPerWmma,
+                                                MRepeat,
+                                                NRepeat,
+                                                KPack>{};
+    }
+    else
+    {
+        static_assert(false, "BlockGemmPipeline configuration is not available");
+    }
+}
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp
new file mode 100644
index 0000000000..31c4729760
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t ABufferLoadWidth,
+          index_t BBufferLoadWidth,
+          index_t ALDSWriteWidth,
+          index_t BLDSWriteWidth,
+          index_t ALDSReadWidth,
+          index_t BLDSReadWidth,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t KPerWmma>
+struct BlockwiseGemmWmmaops_pipeline_hotloop_inst
+{
+    static constexpr index_t WaveSize = 32;
+    static constexpr index_t WaveNumM = MPerBlock / (MRepeat * MPerWmma);
+    static constexpr index_t WaveNumN = NPerBlock / (NRepeat * NPerWmma);
+
+    static constexpr index_t A_LDS_Read_Width = ALDSReadWidth;
+    static constexpr index_t B_LDS_Read_Width = BLDSReadWidth;
+
+    static constexpr index_t A_Buffer_Load_Inst_Num =
+        MPerBlock * KPerBlock / (BlockSize * ABufferLoadWidth);
+    static constexpr index_t B_Buffer_Load_Inst_Num =
+        NPerBlock * KPerBlock / (BlockSize * BBufferLoadWidth);
+
+    static constexpr index_t A_LDS_Write_Inst_Num =
+        MPerBlock * KPerBlock / (BlockSize * ALDSWriteWidth);
+    static constexpr index_t B_LDS_Write_Inst_Num =
+        NPerBlock * KPerBlock / (BlockSize * BLDSWriteWidth);
+
+    static constexpr index_t A_LDS_Read_Inst_Num =
+        WaveNumN * MPerBlock * KPerBlock / (BlockSize * ALDSReadWidth);
+    static constexpr index_t B_LDS_Read_Inst_Num =
+        WaveNumM * NPerBlock * KPerBlock / (BlockSize * BLDSReadWidth);
+
+    static constexpr index_t C_WMMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
+                                               (BlockSize / WaveSize) /
+                                               (MPerWmma * NPerWmma * KPerWmma);
+
+    static constexpr auto Print()
+    {
+        printf(" Blk/Wave Size: %d, %d, M/N/K PerBlk: %d, %d, %d, M/N/K PerWmma: %d, %d, %d\n",
+               BlockSize,
+               WaveSize,
+               MPerBlock,
+               NPerBlock,
+               KPerBlock,
+               MPerWmma,
+               NPerWmma,
+               KPerWmma);
+
+        printf(" A/B buffer load inst: %d, %d\n A/B LDS write inst: %d, %d\n A/B LDS read inst: "
+               "%d, %d\n C WMMA inst: %d\n"
+               "A/B LDS read width: %d, %d, A/B LDS write width: %d, %d, A/B buffer load width: "
+               "%d, %d\n",
+               A_Buffer_Load_Inst_Num,
+               B_Buffer_Load_Inst_Num,
+               A_LDS_Write_Inst_Num,
+               B_LDS_Write_Inst_Num,
+               A_LDS_Read_Inst_Num,
+               B_LDS_Read_Inst_Num,
+               C_WMMA_Inst_Num,
+               A_LDS_Read_Width,
+               B_LDS_Read_Width,
+               ALDSWriteWidth,
+               BLDSWriteWidth,
+               ABufferLoadWidth,
+               BBufferLoadWidth);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
new file mode 100644
index 0000000000..a63d32802e
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB,
+          typename AccDataType,
+          typename AWmmaTileDesc,
+          typename BWmmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack,
+          bool TransposeC = false>
+struct BlockwiseGemmWmmaops_pipeline_base
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I5 = Number<5>{};
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    static constexpr index_t WaveSize = 32;
+
+    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma);
+    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma);
+
+#if defined(__gfx12__)
+    static constexpr index_t A_KRow = 2;
+    static constexpr index_t B_KRow = 2;
+#else
+    static constexpr index_t A_KRow = 1;
+    static constexpr index_t B_KRow = 1;
+#endif
+
+    static constexpr index_t A_K1 = AWmmaTileDesc{}.GetLength(I5);
+    static constexpr index_t B_K1 = BWmmaTileDesc{}.GetLength(I5);
+
+    static_assert(KPack % (A_K1 * A_KRow) == 0, "wrong!");
+    static_assert(KPack % (B_K1 * B_KRow) == 0, "wrong!");
+
+    static constexpr auto wmma_gemm =
+        WmmaGemm<ADataType, BDataType, AccDataType, MPerWmma, NPerWmma, KPack, TransposeC>{};
+
+    static constexpr index_t KRepeat = KPerBlock / KPack;
+
+    static constexpr auto WmmaK = Number<wmma_gemm.wmma_instr.k_per_wmma>{};
+
+    using HotLoopInstList =
+        ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst<BlockSize,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       A_K1,
+                                                       B_K1,
+                                                       A_K1,
+                                                       B_K1,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       MPerWmma,
+                                                       NPerWmma,
+                                                       wmma_gemm.wmma_instr.k_per_wmma>;
+
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                              AccDataType,
+                              MRepeat * NRepeat,
+                              wmma_gemm.GetRegSizePerWmma(),
+                              true>
+        c_thread_buf_;
+
+    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+
+    __device__ static auto GetWaveIdx()
+    {
+        const index_t thread_id = ThisThreadBlock::GetThreadId();
+
+        constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
+    }
+
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto wmma_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex();
+
+#if defined(__gfx12__)
+        const auto wmma_krow = wmma_gemm.GetSubGroupId();
+#else
+        const auto wmma_krow = 0;
+#endif
+
+        //  |KRepeat   |MRepeat|MWave    |KRow  |MLane  |KPack
+        return make_tuple(0, 0, waveId_m, wmma_krow, wmma_a_idx, 0);
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto wmma_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex();
+
+#if defined(__gfx12__)
+        const auto wmma_krow = wmma_gemm.GetSubGroupId();
+#else
+        const auto wmma_krow = 0;
+#endif
+
+        //  |KRepeat   |NRepeat|Nwave     |KRow  |NLane  |KPack
+        return make_tuple(0, 0, waveId_n, wmma_krow, wmma_b_idx, 0);
+    }
+
+    template <index_t m0, index_t n0>
+    __device__ static auto CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>)
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk();
+
+        constexpr auto mrepeat_mwave_mperwmma_to_m_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWmma))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        constexpr auto nrepeat_nwave_nperwmma_to_n_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWmma))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        const index_t c_thread_m = mrepeat_mwave_mperwmma_to_m_adaptor.CalculateBottomIndex(
+            make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
+        const index_t c_thread_n = nrepeat_nwave_nperwmma_to_n_adaptor.CalculateBottomIndex(
+            make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
+
+        return make_tuple(c_thread_m, c_thread_n);
+    }
+
+    using Tuple6 = decltype(CalculateAThreadOriginDataIndex());
+
+    /**
+     * @brief Constructor for BlockwiseGemmWmmaops_pipeline_base.
+     *
+     * This constructor initializes the thread copy objects for matrices A and B.
+     * It also performs several compile-time checks to ensure the correctness of the
+     * matrix tile descriptors.
+     *
+     * @param a_origin The origin data index for matrix A.
+     * @param b_origin The origin data index for matrix B.
+     *
+     * @note The constructor includes static assertions to ensure that:
+     * - The matrix tile descriptors for A and B are known at compile-time.
+     * - The number of threads in the thread block matches the product of MWaves, NWaves, and
+     * WaveSize.
+     * - The dimensions of the block are divisible by the product of the corresponding WMMA and
+     * repeat dimensions.
+     */
+    __host__ __device__
+    BlockwiseGemmWmmaops_pipeline_base(Tuple6 a_origin = CalculateAThreadOriginDataIndex(),
+                                       Tuple6 b_origin = CalculateBThreadOriginDataIndex())
+        : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
+    {
+        static_assert(AWmmaTileDesc::IsKnownAtCompileTime() &&
+                          BWmmaTileDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
+                      "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
+
+        static_assert(MPerBlock % (MPerWmma * MRepeat) == 0 &&
+                          NPerBlock % (NPerWmma * NRepeat) == 0,
+                      "wrong!");
+    }
+
+    __host__ __device__ static constexpr auto
+    GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
+            wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
+
+        constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
+        constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3];
+        return make_naive_tensor_descriptor(
+            //        |MRepeat           |MWave |MSubGroup |NRepeat           |NWave
+            //        |NThreadPerSubGroup |MAccVgprs
+            make_tuple(Number<MRepeat>{}, I1, I1, Number<NRepeat>{}, I1, I1, MAccVgprs),
+            make_tuple(Number<NRepeat>{} * MAccVgprs * AccStride,
+                       Number<NRepeat>{} * MAccVgprs * AccStride,
+                       Number<NRepeat>{} * MAccVgprs * AccStride,
+                       MAccVgprs * AccStride,
+                       MAccVgprs * AccStride,
+                       MAccVgprs * AccStride,
+                       AccStride));
+    }
+
+    __host__ __device__ static constexpr auto
+    GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<MPerWmma>{},
+                                                           Number<NRepeat>{},
+                                                           Number<NWaves>{},
+                                                           Number<NPerWmma>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
+    }
+
+    // Describe how data allocated in thread copy src buffer
+    // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma
+    static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1;
+    static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1;
+
+    protected:
+    static constexpr auto a_thread_desc_ =
+        make_naive_tensor_descriptor(make_tuple(Number<KPack / A_K1 / A_KRow>{},
+                                                Number<MRepeat>{},
+                                                Number<KRepeat>{},
+                                                I1,
+                                                I1,
+                                                Number<A_K1>{}),
+                                     make_tuple(Number<A_K1>{},
+                                                Number<KPack / A_KRow>{},
+                                                Number<KPack * A_K1>{},
+                                                Number<A_K1>{},
+                                                Number<A_K1>{},
+                                                Number<1>{}));
+
+    static constexpr auto b_thread_desc_ =
+        make_naive_tensor_descriptor(make_tuple(Number<KPack / B_K1 / B_KRow>{},
+                                                Number<NRepeat>{},
+                                                Number<KRepeat>{},
+                                                I1,
+                                                I1,
+                                                Number<B_K1>{}),
+                                     make_tuple(Number<B_K1>{},
+                                                Number<KPack / B_KRow>{},
+                                                Number<KPack * B_K1>{},
+                                                Number<B_K1>{},
+                                                Number<B_K1>{},
+                                                Number<1>{}));
+
+    // C[M, N, NumRegWmma]
+    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, wmma_gemm.GetRegSizePerWmma()));
+
+    using AThreadCopy =
+        ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                         ADataType,
+                                         decltype(a_block_desc_k0_m0_m1_m2_k1),
+                                         decltype(a_thread_desc_),
+                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, A_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                         5,
+                                         A_K1,
+                                         A_K1>;
+
+    using BThreadCopy =
+        ThreadwiseTensorSliceTransfer_v4<BDataType,
+                                         BDataType,
+                                         decltype(b_block_desc_k0_n0_n1_n2_k1),
+                                         decltype(b_thread_desc_),
+                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, B_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                         5,
+                                         B_K1,
+                                         B_K1>;
+
+    AThreadCopy a_thread_copy_;
+    BThreadCopy b_thread_copy_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
new file mode 100644
index 0000000000..2fb95f0f8d
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
@@ -0,0 +1,466 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB,
+          typename AccDataType,
+          typename AWmmaTileDesc,
+          typename BWmmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+struct BlockwiseGemmWmmaops_pipeline_v3
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB,
+          typename AccDataType,
+          typename AWmmaTileDesc,
+          typename BWmmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
+                                        BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeTypeA,
+                                        ComputeTypeB,
+                                        AccDataType,
+                                        AWmmaTileDesc,
+                                        BWmmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerWmma,
+                                        NPerWmma,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+    : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
+                                         ADataType,
+                                         BDataType,
+                                         ComputeTypeA,
+                                         ComputeTypeB,
+                                         AccDataType,
+                                         AWmmaTileDesc,
+                                         BWmmaTileDesc,
+                                         ABlockTransferSrcScalarPerVector,
+                                         BBlockTransferSrcScalarPerVector,
+                                         MPerBlock,
+                                         NPerBlock,
+                                         KPerBlock,
+                                         MPerWmma,
+                                         NPerWmma,
+                                         MRepeat,
+                                         NRepeat,
+                                         KPack>
+{
+    using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
+                                                    ADataType,
+                                                    BDataType,
+                                                    ComputeTypeA,
+                                                    ComputeTypeB,
+                                                    AccDataType,
+                                                    AWmmaTileDesc,
+                                                    BWmmaTileDesc,
+                                                    ABlockTransferSrcScalarPerVector,
+                                                    BBlockTransferSrcScalarPerVector,
+                                                    MPerBlock,
+                                                    NPerBlock,
+                                                    KPerBlock,
+                                                    MPerWmma,
+                                                    NPerWmma,
+                                                    MRepeat,
+                                                    NRepeat,
+                                                    KPack>;
+    using Base::I0;
+
+    using Base::A_K1;
+    using Base::A_KRow;
+    using Base::B_K1;
+    using Base::B_KRow;
+    using Base::KRepeat;
+    using Base::WmmaK;
+
+    using Base::wmma_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::
+        GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
+    using Base::GetCThreadBuffer;
+    using Base::
+        GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
+
+    using Base::a_block_desc_k0_m0_m1_m2_k1;
+    using Base::b_block_desc_k0_n0_n1_n2_k1;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 1;
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        ignore = num_loop;
+        return TailNumber::Full;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        // TODO: Calculation of the number of instructions may require changes for WMMA
+        /*
+        // A/B split schedule
+        // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
+        constexpr auto num_ds_read_inst_b =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16
+                ? HotLoopInstList::B_LDS_Read_Inst_Num
+                : HotLoopInstList::B_LDS_Read_Inst_Num / 2;
+
+        constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_ds_write_inst_b = HotLoopInstList::B_LDS_Write_Inst_Num;
+
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_wmma_inst = HotLoopInstList::C_WMMA_Inst_Num;
+
+        constexpr auto wmma_cycle = NPerWmma == 16 ? 16 : 32;
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_b_issue_cycle =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_a_wmma_rate =
+            (wmma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
+        constexpr auto ds_read_b_wmma_rate =
+            (wmma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
+
+        constexpr auto num_dsread_a_wmma =
+            (num_ds_read_inst_a + ds_read_a_wmma_rate - 1) / ds_read_a_wmma_rate;
+        constexpr auto num_dsread_b_wmma =
+            (num_ds_read_inst_b + ds_read_b_wmma_rate - 1) / ds_read_b_wmma_rate;
+
+        // stage 1
+        // Separate this part?
+        // constexpr auto num_wmma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
+        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
+        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
+        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        constexpr auto num_wmma_stage1 = num_wmma_inst - (num_dsread_a_wmma + num_dsread_b_wmma);
+        constexpr auto num_wmma_per_issue =
+            num_wmma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
+        constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a;
+        constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b;
+
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(
+                0x008, num_wmma_per_issue - num_dswrite_per_issue_a, 0); // WMMA
+        });
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(
+                0x008, num_wmma_per_issue - num_dswrite_per_issue_b, 0); // WMMA
+        });
+
+        // stage 2
+        static_for<0, num_dsread_a_wmma, 1>{}([&](auto i) {
+            if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_wmma_rate) >=
+                         ds_read_a_wmma_rate)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_wmma_rate, 0); // DS read
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100,
+                                                     num_ds_read_inst_a - (num_dsread_a_wmma - 1) *
+                                                                              ds_read_a_wmma_rate,
+                                                     0); // DS read
+            }
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA
+        });
+
+        static_for<0, num_dsread_b_wmma, 1>{}([&](auto i) {
+            if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_wmma_rate) >=
+                         ds_read_b_wmma_rate)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_wmma_rate, 0); // DS read
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100,
+                                                     num_ds_read_inst_b - (num_dsread_b_wmma - 1) *
+                                                                              ds_read_b_wmma_rate,
+                                                     0); // DS read
+            }
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA
+        });
+        */
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        const BBlockDesc& b_block_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+        // Global prefetch 2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // Local prefetch 1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k0) {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                a_thread_copy_.Run(
+                    a_block_desc_k0_m0_m1_m2_k1,
+                    make_tuple(Number<k0 * KPack / A_K1 / A_KRow>{}, m0, I0, I0, I0, I0),
+                    a_block_buf,
+                    a_thread_desc_,
+                    make_tuple(I0, m0, k0, I0, I0, I0),
+                    a_thread_buf);
+            });
+            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                b_thread_copy_.Run(
+                    b_block_desc_k0_n0_n1_n2_k1,
+                    make_tuple(Number<k0 * KPack / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
+                    b_block_buf,
+                    b_thread_desc_,
+                    make_tuple(I0, n0, k0, I0, I0, I0),
+                    b_thread_buf);
+            });
+        });
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                block_sync_lds();
+
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
+                            vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+
+                            static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(ik / A_K1, m0, k0, 0, 0, ik % A_K1))>{}];
+                            });
+                            static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
+                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(ik / B_K1, n0, k0, 0, 0, ik % B_K1))>{}];
+                            });
+
+                            using wmma_input_type_a =
+                                typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                            using wmma_input_type_b =
+                                typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
+                                          b_thread_vec.template AsType<wmma_input_type_b>(),
+                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+
+                block_sync_lds();
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_k0_m0_m1_m2_k1,
+                            make_tuple(Number<k0 * KPack / A_K1 / A_KRow>{}, m0, I0, I0, I0, I0),
+                            a_block_buf,
+                            a_thread_desc_,
+                            make_tuple(I0, m0, k0, I0, I0, I0),
+                            a_thread_buf);
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(
+                            b_block_desc_k0_n0_n1_n2_k1,
+                            make_tuple(Number<k0 * KPack / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
+                            b_block_buf,
+                            b_thread_desc_,
+                            make_tuple(I0, n0, k0, I0, I0, I0),
+                            b_thread_buf);
+                    });
+                });
+
+                HotLoopScheduler();
+                __builtin_amdgcn_sched_barrier(0);
+
+                i += 1;
+            } while(i < (num_loop - 1));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Full)
+        {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+
+                        static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(ik / A_K1, m0, k0, 0, 0, ik % A_K1))>{}];
+                        });
+                        static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
+                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(ik / B_K1, n0, k0, 0, 0, ik % B_K1))>{}];
+                        });
+
+                        using wmma_input_type_a =
+                            typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                        using wmma_input_type_b =
+                            typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
+                                      b_thread_vec.template AsType<wmma_input_type_b>(),
+                                      c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+            // Let's leak last WMMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
new file mode 100644
index 0000000000..7a565fbaa7
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1<BlockGemmPipelineScheduler::Intrawave,
+                                                            BlockSize,
+                                                            ADataType,
+                                                            BDataType,
+                                                            ComputeDataType,
+                                                            AccDataType,
+                                                            ATileDesc,
+                                                            BTileDesc,
+                                                            AMmaTileDesc,
+                                                            BMmaTileDesc,
+                                                            ABlockTransferSrcScalarPerVector,
+                                                            BBlockTransferSrcScalarPerVector,
+                                                            MPerBlock,
+                                                            NPerBlock,
+                                                            KPerBlock,
+                                                            MPerXDL,
+                                                            NPerXDL,
+                                                            MRepeat,
+                                                            NRepeat,
+                                                            KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 2;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        // B global
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A global
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A local
+        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+        });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        auto b_thread_dequant_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        StaticallyIndexedArray<decltype(b_thread_dequant_buf), Number<2>{}> b_thread_dequant_bufs;
+
+        // Global prefetch A1 B1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_buf);
+            });
+        });
+        // B VGPR->VGPR dequant
+        b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                   b_block_origin_idx,
+                                   b_thread_bufs(I0),
+                                   b_thread_desc_,
+                                   make_tuple(I0, I0, I0, I0),
+                                   b_thread_dequant_bufs(I0));
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    block_sync_lds();
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_dequant_bufs[mfma_reg_buf]
+                                                             [Number<b_thread_desc_.CalculateOffset(
+                                                                 make_tuple(n0, I0, k0, ik))>{}];
+                                });
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_buf);
+                        });
+                    });
+                    // B VGPR->VGPR dequant
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs(local_read_buf),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs(local_read_buf));
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+
+            block_sync_lds();
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            block_sync_lds();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_buf);
+                });
+            });
+            // B VGPR->VGPR dequant
+            b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                       b_block_origin_idx,
+                                       b_thread_bufs(I1),
+                                       b_thread_desc_,
+                                       make_tuple(I0, I0, I0, I0),
+                                       b_thread_dequant_bufs(I1));
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using BThreadDequantCopy = ThreadwiseTensorSliceTransfer_StaticToStatic<
+        BDataType,
+        ComputeDataType,
+        decltype(b_block_desc_n0_n1_k0_k1),
+        decltype(b_block_desc_n0_n1_k0_k1),
+        tensor_operation::element_wise::PassThrough,
+        Sequence<Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}>,
+        Sequence<1, 2, 0, 3>,
+        3,
+        KPack>;
+
+    const PassThrough b_element_op{};
+    BThreadDequantCopy b_thread_dequant_copy_{b_element_op};
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
new file mode 100644
index 0000000000..4be4e321d3
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<BlockGemmPipelineScheduler::Intrawave,
+                                                            BlockSize,
+                                                            ADataType,
+                                                            BDataType,
+                                                            ComputeDataType,
+                                                            AccDataType,
+                                                            ATileDesc,
+                                                            BTileDesc,
+                                                            AMmaTileDesc,
+                                                            BMmaTileDesc,
+                                                            ABlockTransferSrcScalarPerVector,
+                                                            BBlockTransferSrcScalarPerVector,
+                                                            MPerBlock,
+                                                            NPerBlock,
+                                                            KPerBlock,
+                                                            MPerXDL,
+                                                            NPerXDL,
+                                                            MRepeat,
+                                                            NRepeat,
+                                                            KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::I2;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    using Base::MWaves;
+
+    static constexpr index_t PrefetchStages        = 2;
+    static constexpr index_t PrefillStages         = 1;
+    static constexpr index_t GlobalBufferNum       = 1;
+    static constexpr index_t HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    template <typename Stage>
+    __device__ static constexpr auto HotLoopScheduler(Stage stage)
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto staged_num_ds_read_inst_a =
+            ck::math::integer_divide_ceil(num_ds_read_inst_a, MRepeat);
+        constexpr auto staged_num_mfma = ck::math::integer_divide_ceil(num_mfma, MRepeat);
+
+        constexpr auto staged_num_mfma_per_ds_read_a =
+            ck::math::integer_divide_ceil(staged_num_mfma, staged_num_ds_read_inst_a);
+
+        if constexpr(stage.value == 0)
+        {
+            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                ck::math::integer_divide_ceil(num_buffer_load_inst_b, staged_num_ds_read_inst_a);
+            constexpr auto staged_num_mfma_per_buffer_load_b =
+                ck::math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_b);
+            // B global
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}([&](auto ibuf_inst) {
+                    ignore = ibuf_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                });
+
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 1)
+        {
+            constexpr auto staged_num_mfma_per_ds_write_a =
+                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+            // A local write
+            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 2)
+        {
+            constexpr auto staged_num_mfma_per_buffer_load_a =
+                math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a;
+
+            // A global
+            static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a, 0);  // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                    }
+                }
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            // A local Read
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    template <typename Stage>
+    __device__ static constexpr auto EpilogueScheduler_1(Stage stage)
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        if constexpr(stage.value == 0)
+        {
+            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_buffer_load_b =
+                staged_num_mfma / num_buffer_load_inst_b;
+            // B global
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_buffer_load_b_per_ds_read_a, 1>{}([&](auto ibuf_inst) {
+                    ignore = ibuf_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                });
+
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 1)
+        {
+#if 0
+            constexpr auto staged_num_ds_write_a_per_ds_read_a =
+                num_ds_write_inst_a / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_ds_write_a = staged_num_mfma / num_ds_write_inst_a;
+            // A local write
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_ds_write_a_per_ds_read_a, 1>{}([&](auto idswrite_inst) {
+                    ignore = idswrite_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                });
+
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_ds_write_a_per_ds_read_a, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);  // DS read
+            });
+#elif 1
+            constexpr auto staged_num_mfma_per_ds_write_a =
+                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+            // A local write
+            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+            });
+#endif
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            // A local Read
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    __device__ static constexpr auto EpilogueScheduler_2()
+    {
+        constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        // A local Read
+        static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+            ignore = i_inst;
+            __builtin_amdgcn_sched_group_barrier(0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+        });
+
+        __builtin_amdgcn_sched_barrier(0);
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+        auto b_thread_dequant_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_dequant_buf), Number<2>{}> b_thread_dequant_bufs;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        // Global prefetch A1 B1
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k0) {
+            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                               make_tuple(I0, I0, I0, k0, I0, I0),
+                               a_block_buf.At(I0),
+                               a_thread_desc_,
+                               make_tuple(I0, I0, I0, k0, I0, I0),
+                               a_thread_buf);
+        });
+        // B VGPR->VGPR dequant
+        b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                   b_block_origin_idx,
+                                   b_thread_bufs(I0),
+                                   b_thread_desc_,
+                                   make_tuple(I0, I0, I0, I0),
+                                   b_thread_dequant_bufs(I0));
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        if constexpr(m0.value == 0)
+                        {
+                            b_blockwise_copy.Run(b_grid_desc,
+                                                 b_grid_buf,
+                                                 b_block_desc_n0_n1_k0_k1,
+                                                 b_block_origin_idx,
+                                                 b_thread_bufs(local_read_buf));
+                            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                        }
+                        else if constexpr(m0.value == 1)
+                        {
+                            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
+                        }
+                        else if constexpr(m0.value == 2)
+                        {
+                            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                        }
+
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                           2,
+                                                       I0,
+                                                       I0,
+                                                       k0,
+                                                       I0,
+                                                       ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_dequant_bufs[mfma_reg_buf]
+                                                             [Number<b_thread_desc_.CalculateOffset(
+                                                                 make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+
+                        if constexpr(m0.value == MRepeat - 1)
+                        {
+                            block_sync_lds();
+
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                                    a_block_buf.At(local_read_buf),
+                                    a_thread_desc_,
+                                    make_tuple(
+                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                               2>{},
+                                        I0,
+                                        I0,
+                                        k0,
+                                        I0,
+                                        I0),
+                                    a_thread_buf);
+                            });
+                            // B VGPR->VGPR dequant
+                            b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                                       b_block_origin_idx,
+                                                       b_thread_bufs(local_read_buf),
+                                                       b_thread_desc_,
+                                                       make_tuple(I0, I0, I0, I0),
+                                                       b_thread_dequant_bufs(local_read_buf));
+                        }
+                        else
+                        {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                                    a_block_buf.At(mfma_reg_buf),
+                                    a_thread_desc_,
+                                    make_tuple(
+                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                               2>{},
+                                        I0,
+                                        I0,
+                                        k0,
+                                        I0,
+                                        I0),
+                                    a_thread_buf);
+                            });
+                            // B VGPR->VGPR dequant
+                            b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                                       b_block_origin_idx,
+                                                       b_thread_bufs(mfma_reg_buf),
+                                                       b_thread_desc_,
+                                                       make_tuple(I0, I0, I0, I0),
+                                                       b_thread_dequant_bufs(mfma_reg_buf));
+                        }
+
+                        HotLoopScheduler(m0);
+                    });
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                if constexpr(m0.value == 0)
+                {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(I1));
+                }
+                else if constexpr(m0.value == MRepeat - 1)
+                {
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+                }
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value == MRepeat - 1)
+                {
+                    block_sync_lds();
+
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I1),
+                            a_thread_desc_,
+                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                    // B VGPR->VGPR dequant
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs(I1),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs(I1));
+                }
+                else
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I0),
+                            a_thread_desc_,
+                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                    // B VGPR->VGPR dequant
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs(I0),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs(I0));
+                }
+
+                EpilogueScheduler_1(m0);
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                    (m0 + HotloopLocalBufSwitch) % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value != (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I1),
+                            a_thread_desc_,
+                            make_tuple(
+                                Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                    // B VGPR->VGPR dequant
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs(I1),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs(I1));
+
+                    EpilogueScheduler_2();
+                }
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value != (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                           make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
+                                           a_block_buf.At(I0),
+                                           a_thread_desc_,
+                                           make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                                           a_thread_buf);
+                    });
+                    // B VGPR->VGPR dequant
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs(I0),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs(I0));
+
+                    EpilogueScheduler_2();
+                }
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    // Reduce the vgpr usage here.
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(I2, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using BThreadDequantCopy = ThreadwiseTensorSliceTransfer_StaticToStatic<
+        BDataType,
+        ComputeDataType,
+        decltype(b_block_desc_n0_n1_k0_k1),
+        decltype(b_block_desc_n0_n1_k0_k1),
+        tensor_operation::element_wise::PassThrough,
+        Sequence<Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}>,
+        Sequence<1, 2, 0, 3>,
+        3,
+        KPack>;
+
+    const PassThrough b_element_op{};
+    BThreadDequantCopy b_thread_dequant_copy_{b_element_op};
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
new file mode 100644
index 0000000000..29750b8baa
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
@@ -0,0 +1,621 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
+    BlockGemmPipelineScheduler::Intrawave,
+    BlockSize,
+    ADataType,
+    BDataType,
+    ComputeDataType,
+    AccDataType,
+    ATileDesc,
+    BTileDesc,
+    AMmaTileDesc,
+    BMmaTileDesc,
+    ABlockTransferSrcScalarPerVector,
+    BBlockTransferSrcScalarPerVector,
+    MPerBlock,
+    NPerBlock,
+    KPerBlock,
+    MPerXDL,
+    NPerXDL,
+    MRepeat,
+    NRepeat,
+    KPack> : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+    using Base::c_thread_desc_;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 2;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        // B global
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A global
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A local
+        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+        });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        BBlockTransfer& b_blockwise_copy_up,
+                        const BGridBuffer& b_grid_buf,
+                        const BGridBuffer& b_grid_buf_up,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        CThreadBuffer& c_thread_buf_up,
+                        index_t num_loop) const
+
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        auto b_thread_dequant_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs_up;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        StaticallyIndexedArray<decltype(b_thread_dequant_buf), Number<2>{}> b_thread_dequant_bufs;
+        StaticallyIndexedArray<decltype(b_thread_dequant_buf), Number<2>{}>
+            b_thread_dequant_bufs_up;
+
+        // Global prefetch A1 B1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+        b_blockwise_copy_up.Run(b_grid_desc,
+                                b_grid_buf_up,
+                                b_block_desc_n0_n1_k0_k1,
+                                b_block_origin_idx,
+                                b_thread_bufs_up(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_buf);
+            });
+        });
+        // B VGPR->VGPR dequant
+        b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                   b_block_origin_idx,
+                                   b_thread_bufs(I0),
+                                   b_thread_desc_,
+                                   make_tuple(I0, I0, I0, I0),
+                                   b_thread_dequant_bufs(I0));
+        b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                   b_block_origin_idx,
+                                   b_thread_bufs_up(I0),
+                                   b_thread_desc_,
+                                   make_tuple(I0, I0, I0, I0),
+                                   b_thread_dequant_bufs_up(I0));
+
+        // Initialize C
+        c_thread_buf.Clear();
+        c_thread_buf_up.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                    b_blockwise_copy_up.Run(b_grid_desc,
+                                            b_grid_buf_up,
+                                            b_block_desc_n0_n1_k0_k1,
+                                            b_block_origin_idx,
+                                            b_thread_bufs_up(local_read_buf));
+                    b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    block_sync_lds();
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_dequant_bufs[mfma_reg_buf]
+                                                             [Number<b_thread_desc_.CalculateOffset(
+                                                                 make_tuple(n0, I0, k0, ik))>{}];
+                                    b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_dequant_bufs_up
+                                            [mfma_reg_buf][Number<b_thread_desc_.CalculateOffset(
+                                                make_tuple(n0, I0, k0, ik))>{}];
+                                });
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec_up.template AsType<mfma_input_type>(),
+                                    c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_buf);
+                        });
+                    });
+                    // B VGPR->VGPR dequant
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs(local_read_buf),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs(local_read_buf));
+                    b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                               b_block_origin_idx,
+                                               b_thread_bufs_up(local_read_buf),
+                                               b_thread_desc_,
+                                               make_tuple(I0, I0, I0, I0),
+                                               b_thread_dequant_bufs_up(local_read_buf));
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+
+            b_blockwise_copy_up.Run(b_grid_desc,
+                                    b_grid_buf_up,
+                                    b_block_desc_n0_n1_k0_k1,
+                                    b_block_origin_idx,
+                                    b_thread_bufs_up(I1));
+
+            block_sync_lds();
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            block_sync_lds();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_buf);
+                });
+            });
+            // B VGPR->VGPR dequant
+            b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                       b_block_origin_idx,
+                                       b_thread_bufs(I1),
+                                       b_thread_desc_,
+                                       make_tuple(I0, I0, I0, I0),
+                                       b_thread_dequant_bufs(I1));
+
+            b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1,
+                                       b_block_origin_idx,
+                                       b_thread_bufs_up(I1),
+                                       b_thread_desc_,
+                                       make_tuple(I0, I0, I0, I0),
+                                       b_thread_dequant_bufs_up(I1));
+            __builtin_amdgcn_sched_barrier(0);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs_up[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_dequant_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using BThreadDequantCopy = ThreadwiseTensorSliceTransfer_StaticToStatic<
+        BDataType,
+        ComputeDataType,
+        decltype(b_block_desc_n0_n1_k0_k1),
+        decltype(b_block_desc_n0_n1_k0_k1),
+        tensor_operation::element_wise::PassThrough,
+        Sequence<Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}>,
+        Sequence<1, 2, 0, 3>,
+        3,
+        KPack>;
+
+    const PassThrough b_element_op{};
+    BThreadDequantCopy b_thread_dequant_copy_{b_element_op};
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp
new file mode 100644
index 0000000000..73749c6309
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp
@@ -0,0 +1,573 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1<BlockGemmPipelineScheduler::Intrawave,
+                                                            BlockSize,
+                                                            ADataType,
+                                                            BDataType,
+                                                            ComputeDataType,
+                                                            AccDataType,
+                                                            ATileDesc,
+                                                            BTileDesc,
+                                                            AMmaTileDesc,
+                                                            BMmaTileDesc,
+                                                            ABlockTransferSrcScalarPerVector,
+                                                            BBlockTransferSrcScalarPerVector,
+                                                            MPerBlock,
+                                                            NPerBlock,
+                                                            KPerBlock,
+                                                            MPerXDL,
+                                                            NPerXDL,
+                                                            MRepeat,
+                                                            NRepeat,
+                                                            KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+    using Base::c_thread_desc_;
+    using Base::MWaves;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 2;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b =
+            HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves * 2;
+        constexpr auto mfma_interleave = MPerXDL == 32 ? 1 : 2;
+        // B global
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            if constexpr(MPerBlock >= 128 && NPerBlock >= 64)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0);
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0);
+            }
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            // if constexpr(i.value < num_buffer_load_inst_a) {
+            //     __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            //     __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            //     __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            //     __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            // }
+        });
+
+        // A global
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A local
+        static_for<0, MPerXDL == 32 ? num_ds_read_inst_a / 2 : num_ds_read_inst_a, 1>{}(
+            [&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);                     // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, MPerXDL == 32 ? 2 : 1, 0); // DS read
+            });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        BBlockTransfer& b_blockwise_copy_up,
+                        const BGridBuffer& b_grid_buf,
+                        const BGridBuffer& b_grid_buf_up,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        CThreadBuffer& c_thread_buf_up,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs_up;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        // Global prefetch A1 B1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+        b_blockwise_copy_up.Run(b_grid_desc,
+                                b_grid_buf_up,
+                                b_block_desc_n0_n1_k0_k1,
+                                b_block_origin_idx,
+                                b_thread_bufs_up(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_buf);
+            });
+        });
+
+        // Initialize C
+        c_thread_buf.Clear();
+        c_thread_buf_up.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    b_blockwise_copy_up.Run(b_grid_desc,
+                                            b_grid_buf_up,
+                                            b_block_desc_n0_n1_k0_k1,
+                                            b_block_origin_idx,
+                                            b_thread_bufs_up(local_read_buf));
+                    b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                    block_sync_lds();
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                    b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs_up[mfma_reg_buf]
+                                                        [Number<b_thread_desc_.CalculateOffset(
+                                                            make_tuple(n0, I0, k0, ik))>{}];
+                                });
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec_up.template AsType<mfma_input_type>(),
+                                    c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_buf);
+                        });
+                    });
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+
+            b_blockwise_copy_up.Run(b_grid_desc,
+                                    b_grid_buf_up,
+                                    b_block_desc_n0_n1_k0_k1,
+                                    b_block_origin_idx,
+                                    b_thread_bufs_up(I1));
+            block_sync_lds();
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            block_sync_lds();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_buf);
+                });
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
index 9c450a9c41..074b5873ee 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
@@ -1,11 +1,18 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp"
+
 namespace ck {
 
 template <BlockGemmPipelineVersion BlkGemmPipelineVer,
@@ -28,31 +35,113 @@ template <BlockGemmPipelineVersion BlkGemmPipelineVer,
           index_t NPerXDL,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool GUFusion = false>
 constexpr auto BlockGemmBPreshufflePipeline_Selector()
 {
     if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
     {
-        return BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlkGemmPipeSche,
-                                                           BlockSize,
-                                                           ADataType,
-                                                           BDataType,
-                                                           ComputeDataType,
-                                                           AccDataType,
-                                                           ATileDesc,
-                                                           BTileDesc,
-                                                           AMmaTileDesc,
-                                                           BMmaTileDesc,
-                                                           ABlockTransferSrcScalarPerVector,
-                                                           BBlockTransferSrcScalarPerVector,
-                                                           MPerBlock,
-                                                           NPerBlock,
-                                                           KPerBlock,
-                                                           MPerXDL,
-                                                           NPerXDL,
-                                                           MRepeat,
-                                                           NRepeat,
-                                                           KPack>{};
+        if constexpr(std::is_same<ADataType, BDataType>::value)
+        {
+            if constexpr(GUFusion)
+            {
+                return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1<
+                    BlkGemmPipeSche,
+                    BlockSize,
+                    ADataType,
+                    BDataType,
+                    ComputeDataType,
+                    AccDataType,
+                    ATileDesc,
+                    BTileDesc,
+                    AMmaTileDesc,
+                    BMmaTileDesc,
+                    ABlockTransferSrcScalarPerVector,
+                    BBlockTransferSrcScalarPerVector,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    MPerXDL,
+                    NPerXDL,
+                    MRepeat,
+                    NRepeat,
+                    KPack>{};
+            }
+            else
+            {
+                return BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlkGemmPipeSche,
+                                                                   BlockSize,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   ComputeDataType,
+                                                                   AccDataType,
+                                                                   ATileDesc,
+                                                                   BTileDesc,
+                                                                   AMmaTileDesc,
+                                                                   BMmaTileDesc,
+                                                                   ABlockTransferSrcScalarPerVector,
+                                                                   BBlockTransferSrcScalarPerVector,
+                                                                   MPerBlock,
+                                                                   NPerBlock,
+                                                                   KPerBlock,
+                                                                   MPerXDL,
+                                                                   NPerXDL,
+                                                                   MRepeat,
+                                                                   NRepeat,
+                                                                   KPack>{};
+            }
+        }
+        else
+        {
+            if constexpr(GUFusion)
+            {
+                return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
+                    BlkGemmPipeSche,
+                    BlockSize,
+                    ADataType,
+                    BDataType,
+                    ComputeDataType,
+                    AccDataType,
+                    ATileDesc,
+                    BTileDesc,
+                    AMmaTileDesc,
+                    BMmaTileDesc,
+                    ABlockTransferSrcScalarPerVector,
+                    BBlockTransferSrcScalarPerVector,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    MPerXDL,
+                    NPerXDL,
+                    MRepeat,
+                    NRepeat,
+                    KPack>{};
+            }
+            else
+            {
+                return BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1<
+                    BlkGemmPipeSche,
+                    BlockSize,
+                    ADataType,
+                    BDataType,
+                    ComputeDataType,
+                    AccDataType,
+                    ATileDesc,
+                    BTileDesc,
+                    AMmaTileDesc,
+                    BMmaTileDesc,
+                    ABlockTransferSrcScalarPerVector,
+                    BBlockTransferSrcScalarPerVector,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    MPerXDL,
+                    NPerXDL,
+                    MRepeat,
+                    NRepeat,
+                    KPack>{};
+            }
+        }
     }
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
     {
@@ -80,26 +169,53 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector()
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
-        return BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlkGemmPipeSche,
-                                                           BlockSize,
-                                                           ADataType,
-                                                           BDataType,
-                                                           ComputeDataType,
-                                                           AccDataType,
-                                                           ATileDesc,
-                                                           BTileDesc,
-                                                           AMmaTileDesc,
-                                                           BMmaTileDesc,
-                                                           ABlockTransferSrcScalarPerVector,
-                                                           BBlockTransferSrcScalarPerVector,
-                                                           MPerBlock,
-                                                           NPerBlock,
-                                                           KPerBlock,
-                                                           MPerXDL,
-                                                           NPerXDL,
-                                                           MRepeat,
-                                                           NRepeat,
-                                                           KPack>{};
+        if constexpr(std::is_same<ADataType, BDataType>::value)
+        {
+            return BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlkGemmPipeSche,
+                                                               BlockSize,
+                                                               ADataType,
+                                                               BDataType,
+                                                               ComputeDataType,
+                                                               AccDataType,
+                                                               ATileDesc,
+                                                               BTileDesc,
+                                                               AMmaTileDesc,
+                                                               BMmaTileDesc,
+                                                               ABlockTransferSrcScalarPerVector,
+                                                               BBlockTransferSrcScalarPerVector,
+                                                               MPerBlock,
+                                                               NPerBlock,
+                                                               KPerBlock,
+                                                               MPerXDL,
+                                                               NPerXDL,
+                                                               MRepeat,
+                                                               NRepeat,
+                                                               KPack>{};
+        }
+        else
+        {
+            return BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3<
+                BlkGemmPipeSche,
+                BlockSize,
+                ADataType,
+                BDataType,
+                ComputeDataType,
+                AccDataType,
+                ATileDesc,
+                BTileDesc,
+                AMmaTileDesc,
+                BMmaTileDesc,
+                ABlockTransferSrcScalarPerVector,
+                BBlockTransferSrcScalarPerVector,
+                MPerBlock,
+                NPerBlock,
+                KPerBlock,
+                MPerXDL,
+                NPerXDL,
+                MRepeat,
+                NRepeat,
+                KPack>{};
+        }
     }
     else
     {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
index 8ed25895b5..d751543175 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -141,6 +141,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::MWaves;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -184,12 +185,19 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
     {
         constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
         constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
-        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
-
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves;
+        constexpr auto mfma_interleave        = MPerXDL == 32 ? 1 : 2;
         // B global
         static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
             ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            if constexpr(MPerBlock >= 128 && NPerBlock >= 128)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0);
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0);
+            }
             __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
         });
 
@@ -203,11 +211,12 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
         });
 
         // A local
-        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
-        });
+        static_for<0, MPerXDL == 32 ? num_ds_read_inst_a / 2 : num_ds_read_inst_a, 1>{}(
+            [&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);                     // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, MPerXDL == 32 ? 2 : 1, 0); // DS read
+            });
     }
 
     template <bool HasMainLoop,
@@ -320,7 +329,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
                                                      [Number<b_thread_desc_.CalculateOffset(
                                                          make_tuple(n0, I0, k0, ik))>{}];
                                 });
-
                                 using mfma_input_type =
                                     typename vector_type<ComputeDataType,
                                                          xdlops_gemm.K1PerXdlops>::type;
@@ -445,7 +453,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
             // latency
             // __builtin_amdgcn_sched_barrier(0);
         }
-        else
+        else if constexpr(TailNum == TailNumber::Odd)
         {
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index 49af782132..6d115e7620 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -784,7 +784,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
             // latency
             // __builtin_amdgcn_sched_barrier(0);
         }
-        else
+        else if constexpr(TailNum == TailNumber::Odd)
         {
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
index 45ed6845c2..ce507ca8d3 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -46,7 +46,8 @@ struct BlockwiseGemmXdlops_pipeline_base
     static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
     static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
-    static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 =
+        BTileDesc{}.GetLength(Number < BTileDesc{}.GetNumOfDimension() == 4 ? 3 : 2 > {});
 
     static constexpr auto xdlops_gemm =
         XdlopsGemm<ComputeDataType, MPerXDL, NPerXDL, KPack, ComputeDataType, TransposeC>{};
@@ -181,6 +182,23 @@ struct BlockwiseGemmXdlops_pipeline_base
 
     using Tuple4 = decltype(CalculateAThreadOriginDataIndex());
 
+    /**
+     * @brief Constructor for BlockwiseGemmXdlops_pipeline_base.
+     *
+     * This constructor initializes the thread copy objects for matrices A and B.
+     * It also performs several compile-time checks to ensure the correctness of the
+     * matrix tile descriptors.
+     *
+     * @param a_origin The origin data index for matrix A.
+     * @param b_origin The origin data index for matrix B.
+     *
+     * @note The constructor includes static assertions to ensure that:
+     * - The matrix tile descriptors for A and B are known at compile-time.
+     * - The number of threads in the thread block matches the product of MWaves, NWaves, and
+     * WaveSize.
+     * - The dimensions of the block are divisible by the product of the corresponding XDL and
+     * repeat dimensions.
+     */
     __host__ __device__
     BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin = CalculateAThreadOriginDataIndex(),
                                       Tuple4 b_origin = CalculateBThreadOriginDataIndex())
@@ -316,7 +334,7 @@ struct BlockwiseGemmXdlops_pipeline_base
         return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
             c_grid_desc_g_m0_n0_m1_n1_m2_n2);
     }
-
+    __host__ __device__ static constexpr auto GetCThreadDesc() { return c_thread_desc_; }
     static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k;
     static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k;
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp
new file mode 100644
index 0000000000..c1433659d6
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp"
+
+namespace ck {
+
+/**
+ * @brief Define matrix data types that have hardware support for MX GEMMs
+ */
+template <typename T>
+static constexpr bool is_scale_mfma_data_type()
+{
+    return is_same_v<T, f8_ocp_t> || is_same_v<T, bf8_ocp_t> || is_same_v<T, f6_t> ||
+           is_same_v<T, bf6_t> || is_same_v<T, f4_t>;
+}
+
+/**
+ * @brief Define scale data types that have hardware support for MX GEMMs
+ */
+template <typename T>
+static constexpr bool is_scale_mfma_scale_type()
+{
+    return is_same_v<T, e8m0_bexp_t>;
+}
+
+/**
+ * @brief Combination of data types that have hardware support for MX GEMMs
+ */
+template <typename ADataType, typename BDataType, typename AScaleDataType, typename BScaleDataType>
+static constexpr bool scale_mfma_hw_support()
+{
+    return is_scale_mfma_data_type<ADataType>() && is_scale_mfma_data_type<BDataType>() &&
+           is_scale_mfma_scale_type<AScaleDataType>() && is_scale_mfma_scale_type<BScaleDataType>();
+}
+
+template <BlockGemmPipelineVersion BlkGemmPipelineVer,
+          BlockGemmPipelineScheduler BlkGemmPipeSche,
+          index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ComputeDataType, // TODO: remove this as in this pipeline ADataType and BDataType
+                                    // must be used for compute
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+constexpr auto BlockGemmMXPipeline_Selector()
+{
+
+    // Hardware MX GEMM pipeline
+    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+    {
+        return BlockwiseGemmXdlops_pipeline_v1_mx<BlkGemmPipeSche,
+                                                  ThreadBlockSize,
+                                                  ScaleBlockSize,
+                                                  ADataType,
+                                                  AScaleDataType,
+                                                  BDataType,
+                                                  BScaleDataType,
+                                                  ATileDesc,
+                                                  BTileDesc,
+                                                  AMmaTileDesc,
+                                                  BMmaTileDesc,
+                                                  ABlockTransferSrcScalarPerVector,
+                                                  BBlockTransferSrcScalarPerVector,
+                                                  MPerBlock,
+                                                  NPerBlock,
+                                                  KPerBlock,
+                                                  MPerXDL,
+                                                  NPerXDL,
+                                                  MRepeat,
+                                                  NRepeat,
+                                                  KPack>{};
+    }
+    else
+    {
+        std::cerr << "MX GEMM Pipeline configuration is not available" << std::endl;
+    }
+}
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
index 821bbb0051..8375e81fa0 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
@@ -7,10 +7,10 @@
 
 namespace ck {
 
-// Naive pipeline with lowest resource request per WGP
-// GlobalPrefetchStages: 1
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
 // LocalPreFillStages: 1
-// LocalPreFetchStages: 0
+// LocalPreFetchStages: 1
 // LocalSharedMemoryBuffer: 1
 
 template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
@@ -96,7 +96,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                         NPerXDL,
                                         MRepeat,
                                         NRepeat,
-                                        KPack>
+                                        KPack,
+                                        true>
 
 {
     using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
@@ -117,10 +118,15 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                                    NPerXDL,
                                                    MRepeat,
                                                    NRepeat,
-                                                   KPack>;
+                                                   KPack,
+                                                   true>;
+    using Base::A_K1;
+    using Base::B_K1;
     using Base::I0;
+    using Base::I1;
     using Base::KRepeat;
     using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
 
     using Base::CalculateCThreadOriginDataIndex;
     using Base::CalculateCThreadOriginDataIndex8D;
@@ -131,19 +137,43 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
     using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetWaveIdx;
     using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
     using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
 
     using Base::a_block_desc_m0_m1_m2_k;
     using Base::b_block_desc_n0_n1_n2_k;
 
-    using Base::AMmaKStride;
-    using Base::BMmaKStride;
+    static constexpr index_t AMmaKStride = xdlops_gemm.K0PerXdlops * KPack;
+    static constexpr index_t BMmaKStride = xdlops_gemm.K0PerXdlops * KPack;
 
-    static constexpr index_t PrefetchStages  = 1;
+    static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
 
+    // Force mfma not cross the scaleblock
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPack * xdlops_a_idx[I0]);
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPack * xdlops_b_idx[I0]);
+    }
+
     __host__ static constexpr bool BlockHasHotloop(index_t num_loop)
     {
         return num_loop > PrefetchStages;
@@ -151,11 +181,116 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
 
     __host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
     {
-        ignore = num_loop;
-        return TailNumber::Full;
+        return num_loop == 1 ? TailNumber::Odd : TailNumber::Full;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        // A/B split schedule
+        // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
+        constexpr auto num_ds_read_inst_b =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16
+                ? HotLoopInstList::B_LDS_Read_Inst_Num
+                : HotLoopInstList::B_LDS_Read_Inst_Num / 2;
+
+        constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_ds_write_inst_b = HotLoopInstList::B_LDS_Write_Inst_Num;
+
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_b_issue_cycle =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_a_mfma_rate =
+            (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
+        constexpr auto ds_read_b_mfma_rate =
+            (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
+
+        constexpr auto num_dsread_a_mfma =
+            (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
+        constexpr auto num_dsread_b_mfma =
+            (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate;
+
+        // stage 1
+        // Separate this part?
+        // constexpr auto num_mfma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) >
+        //                                               sizeof(ComputeDataType) / sizeof(BDataType)
+        //                                           ? sizeof(ComputeDataType) / sizeof(ADataType)
+        //                                           : sizeof(ComputeDataType) / sizeof(BDataType);
+        constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
+        constexpr auto num_mfma_per_issue =
+            num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
+        constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a;
+        constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b;
+
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(
+                0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA
+        });
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
+                ignore = idswrite;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(
+                0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA
+        });
+
+        // stage 2
+        static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) {
+            if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
+                         ds_read_a_mfma_rate)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100,
+                                                     num_ds_read_inst_a - (num_dsread_a_mfma - 1) *
+                                                                              ds_read_a_mfma_rate,
+                                                     0); // DS read
+            }
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+        });
+
+        static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) {
+            if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >=
+                         ds_read_b_mfma_rate)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x100,
+                                                     num_ds_read_inst_b - (num_dsread_b_mfma - 1) *
+                                                                              ds_read_b_mfma_rate,
+                                                     0); // DS read
+            }
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+        });
     }
 
     template <bool HasMainLoop,
+              int NumKBlockPerScale,
               TailNumber TailNum,
               typename AGridDesc,
               typename ABlockDesc,
@@ -169,6 +304,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
               typename BGridBuffer,
               typename BBlockBuffer,
               typename BBlockTransferStep,
+              typename CScaleThreadDesc,
               typename CThreadBuffer,
               typename AScaleGridBuffer,
               typename AScaleGridDesc,
@@ -196,6 +332,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
         BBlockBuffer& b_block_buf,
         const BBlockTransferStep& b_block_copy_step,
         // CThread
+        const CScaleThreadDesc& c_scale_thread_desc,
         CThreadBuffer& c_thread_buf,
         // AScaleThreadCopy
         const AScaleGridDesc& a_scale_grid_desc,
@@ -210,11 +347,10 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
         const BScaleGridBuffer& b_scale_grid_buf,
         const BScaleThreadTransferStep& b_scale_thread_copy_step,
         // num_loop
-        index_t num_loop,
-        index_t num_loop_per_scale) const
+        index_t num_loop) const
     {
+        __builtin_amdgcn_sched_barrier(0);
         // assume kperblock = scaleblockk
-        ignore            = num_loop_per_scale;
         auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
             a_thread_desc_.GetElementSpaceSize());
         auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
@@ -223,6 +359,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
             a_scale_thread_desc.GetElementSpaceSize());
         auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             b_scale_thread_desc.GetElementSpaceSize());
+        auto c_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            c_scale_thread_desc.GetElementSpaceSize());
 
         // Global prefetch 1
         a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -231,11 +369,26 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
         a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
         b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
 
-        a_scale_thread_copy.Run(a_scale_grid_desc,
-                                a_scale_grid_buf,
-                                a_scale_thread_desc,
-                                make_tuple(I0, I0),
-                                a_scale_thread_buf);
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                    a_scale_grid_buf,
+                                    a_scale_thread_desc,
+                                    make_tuple(m0, I0),
+                                    a_scale_thread_buf);
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
 
         b_scale_thread_copy.Run(b_scale_grid_desc,
                                 b_scale_grid_buf,
@@ -243,17 +396,101 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                                 make_tuple(I0, I0),
                                 b_scale_thread_buf);
 
-        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step);
         b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
 
+        constexpr auto num_scale_k_block = CScaleThreadDesc{}.GetLength(Number<0>{});
+        constexpr auto num_scale_m_block = CScaleThreadDesc{}.GetLength(Number<1>{});
+        constexpr auto num_scale_n_block = CScaleThreadDesc{}.GetLength(Number<2>{});
+
+        static_for<0, num_scale_m_block, 1>{}([&](auto m0) {
+            static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
+                static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
+                    constexpr index_t c_offset =
+                        CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0));
+                    constexpr index_t a_offset =
+                        AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0));
+                    constexpr index_t b_offset =
+                        BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0));
+
+                    c_scale_thread_buf(Number<c_offset>{}) =
+                        a_scale_thread_buf[Number<a_offset>{}] *
+                        b_scale_thread_buf[Number<b_offset>{}];
+                });
+            });
+        });
+
         // Local prefill 1
         a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
         b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
 
+        // Global prefetch 2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                    a_scale_grid_buf,
+                                    a_scale_thread_desc,
+                                    make_tuple(m0, I0),
+                                    a_scale_thread_buf);
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
+
+        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                b_scale_grid_buf,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_buf);
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
         // Initialize C
         c_thread_buf.Clear();
 
-        auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>();
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  AccDataType,
+                                  1,
+                                  xdlops_gemm.GetRegSizePerXdlops(),
+                                  true>
+            c_thread_buf_per_scale;
+
+        // Local prefetch 1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k0) {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                   make_tuple(m0, I0, I0, Number<k0 * AMmaKStride>{}),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, k0, I0),
+                                   a_thread_buf);
+            });
+            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                   make_tuple(n0, I0, I0, Number<k0 * BMmaKStride>{}),
+                                   b_block_buf,
+                                   b_thread_desc_,
+                                   make_tuple(n0, I0, k0, I0),
+                                   b_thread_buf);
+            });
+        });
+
+        __builtin_amdgcn_sched_barrier(0);
 
         // main body
         if constexpr(HasMainLoop)
@@ -261,13 +498,85 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
             index_t i = 0;
             do
             {
-                // -------------------------------------------------------------------------------------------
+                block_sync_lds();
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
                 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
                 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
 
                 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
                 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
 
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                            static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()(Number<t>{}) = 0;
+                            });
+                            static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0,
+                                                       I0,
+                                                       kscale0 * KRepeat / num_scale_k_block + k0,
+                                                       ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                            make_tuple(n0,
+                                                       I0,
+                                                       kscale0 * KRepeat / num_scale_k_block + k0,
+                                                       ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                xdlops_gemm.template Run<>(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                            });
+                            static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                                constexpr index_t cscale_offset =
+                                    CScaleThreadDesc{}.CalculateOffset(
+                                        make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                                c_thread_buf(Number<c_offset>{}) +=
+                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                        .template AsType<AccDataType>()[Number<t>{}] *
+                                    type_convert<AccDataType>(
+                                        c_scale_thread_buf[Number<cscale_offset>{}]);
+                            });
+                        });
+                    });
+                });
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
+                        static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
+                            constexpr index_t c_offset =
+                                CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0));
+                            constexpr index_t a_offset =
+                                AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0));
+                            constexpr index_t b_offset =
+                                BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0));
+
+                            c_scale_thread_buf(Number<c_offset>{}) =
+                                a_scale_thread_buf[Number<a_offset>{}] *
+                                b_scale_thread_buf[Number<b_offset>{}];
+                        });
+                    });
+                });
+
                 block_sync_lds();
                 static_for<0, KRepeat, 1>{}([&](auto k) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -289,19 +598,70 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                 });
 
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        c_thread_buf_per_scale.Clear();
-                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc,
+                                            make_tuple(m0, I0),
+                                            a_scale_thread_buf);
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{}));
+                });
+
+                if constexpr(NumKBlockPerScale == 1)
+                {
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{}));
+                }
+                else
+                {
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{}));
+                }
+
+                b_scale_thread_copy.Run(b_scale_grid_desc,
+                                        b_scale_grid_buf,
+                                        b_scale_thread_desc,
+                                        make_tuple(I0, I0),
+                                        b_scale_thread_buf);
+
+                b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+                HotLoopScheduler();
+                __builtin_amdgcn_sched_barrier(0);
+                i += 1;
+            } while(i < (num_loop - 2));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Full)
+        {
+            block_sync_lds();
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        });
+                        static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
                             vector_type<ComputeDataType, KPack> a_thread_vec;
                             vector_type<ComputeDataType, KPack> b_thread_vec;
 
                             static_for<0, KPack, 1>{}([&](auto ik) {
                                 a_thread_vec.template AsType<ComputeDataType>()(ik) =
                                     a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                        make_tuple(m0, I0, k0, ik))>{}];
+                                        make_tuple(m0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
                                 b_thread_vec.template AsType<ComputeDataType>()(ik) =
                                     b_thread_buf[Number<b_thread_desc_.CalculateOffset(
-                                        make_tuple(n0, I0, k0, ik))>{}];
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
                             });
 
                             using mfma_input_type =
@@ -311,46 +671,41 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                             xdlops_gemm.template Run<>(
                                 a_thread_vec.template AsType<mfma_input_type>(),
                                 b_thread_vec.template AsType<mfma_input_type>(),
-                                c_thread_buf_per_scale.GetVectorTypeReference(I0));
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
                         });
                         static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
                             constexpr index_t c_offset =
                                 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                            constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset(
+                                make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
                             c_thread_buf(Number<c_offset>{}) +=
-                                c_thread_buf_per_scale[Number<t>{}] *
-                                type_convert<AccDataType>(a_scale_thread_buf[I0]) *
-                                type_convert<AccDataType>(b_scale_thread_buf[I0]);
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()[Number<t>{}] *
+                                type_convert<AccDataType>(
+                                    c_scale_thread_buf[Number<cscale_offset>{}]);
                         });
                     });
                 });
+            });
 
-                a_scale_thread_copy.Run(a_scale_grid_desc,
-                                        a_scale_grid_buf,
-                                        a_scale_thread_desc,
-                                        make_tuple(I0, I0),
-                                        a_scale_thread_buf);
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
+                        constexpr index_t c_offset =
+                            CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0));
+                        constexpr index_t a_offset =
+                            AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0));
+                        constexpr index_t b_offset =
+                            BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0));
 
-                b_scale_thread_copy.Run(b_scale_grid_desc,
-                                        b_scale_grid_buf,
-                                        b_scale_thread_desc,
-                                        make_tuple(I0, I0),
-                                        b_scale_thread_buf);
+                        c_scale_thread_buf(Number<c_offset>{}) =
+                            a_scale_thread_buf[Number<a_offset>{}] *
+                            b_scale_thread_buf[Number<b_offset>{}];
+                    });
+                });
+            });
 
-                a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step);
-                b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
-
-                block_sync_lds();
-                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
-
-                i += 1;
-
-            } while(i < (num_loop - 1));
-        }
-
-        // tail
-        if constexpr(TailNum == TailNumber::Full)
-        {
             block_sync_lds();
             static_for<0, KRepeat, 1>{}([&](auto k) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -371,49 +726,143 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
                 });
             });
 
+            HotLoopScheduler();
+            __builtin_amdgcn_sched_barrier(0);
+
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    c_thread_buf_per_scale.Clear();
-                    static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
-
-                        static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                    make_tuple(m0, I0, k0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
-                                    make_tuple(n0, I0, k0, ik))>{}];
+                    static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
+                        static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
 
-                        using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                            });
 
-                        xdlops_gemm.template Run<>(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
-                            c_thread_buf_per_scale.GetVectorTypeReference(I0));
-                    });
-                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
-                        constexpr index_t c_offset =
-                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
-                        c_thread_buf(Number<c_offset>{}) +=
-                            c_thread_buf_per_scale[Number<t>{}] *
-                            type_convert<AccDataType>(a_scale_thread_buf[I0]) *
-                            type_convert<AccDataType>(b_scale_thread_buf[I0]);
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                        });
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                            constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset(
+                                make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                            c_thread_buf(Number<c_offset>{}) +=
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()[Number<t>{}] *
+                                type_convert<AccDataType>(
+                                    c_scale_thread_buf[Number<cscale_offset>{}]);
+                        });
                     });
                 });
             });
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        });
+                        static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                        });
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
+                            constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset(
+                                make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                            c_thread_buf(Number<c_offset>{}) +=
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()[Number<t>{}] *
+                                type_convert<AccDataType>(
+                                    c_scale_thread_buf[Number<cscale_offset>{}]);
+                        });
+                    });
+                });
+            });
+            __builtin_amdgcn_sched_barrier(0);
         }
     }
 
     protected:
-    using Base::a_thread_copy_;
     using Base::a_thread_desc_;
-    using Base::b_thread_copy_;
     using Base::b_thread_desc_;
     using Base::c_thread_desc_;
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<BDataType,
+                                                         ComputeDataType,
+                                                         decltype(b_block_desc_n0_n1_n2_k),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
 };
 
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp
new file mode 100644
index 0000000000..9acf401410
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp
@@ -0,0 +1,613 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Naive pipeline with lowest resource request per WGP
+// GlobalPrefetchStages: 1
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 0
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat, // MXdlPerWave
+          index_t NRepeat, // NXdlPerWave
+          index_t KPack>
+struct BlockwiseGemmXdlops_pipeline_v1_mx
+{
+};
+
+template <index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat, // MXdlPerWave
+          index_t NRepeat, // NXdlPerWave
+          index_t KPack>
+struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
+                                          ThreadBlockSize,
+                                          ScaleBlockSize,
+                                          ADataType,
+                                          AScaleDataType,
+                                          BDataType,
+                                          BScaleDataType,
+                                          ATileDesc,
+                                          BTileDesc,
+                                          AMmaTileDesc,
+                                          BMmaTileDesc,
+                                          ABlockTransferSrcScalarPerVector,
+                                          BBlockTransferSrcScalarPerVector,
+                                          MPerBlock,
+                                          NPerBlock,
+                                          KPerBlock,
+                                          MPerXDL,
+                                          NPerXDL,
+                                          MRepeat,
+                                          NRepeat,
+                                          KPack>
+    : BlockwiseGemmXdlops_mx_pipeline_base<ThreadBlockSize,
+                                           ADataType,
+                                           BDataType,
+                                           ATileDesc,
+                                           BTileDesc,
+                                           AMmaTileDesc,
+                                           BMmaTileDesc,
+                                           ABlockTransferSrcScalarPerVector,
+                                           BBlockTransferSrcScalarPerVector,
+                                           MPerBlock,
+                                           NPerBlock,
+                                           KPerBlock,
+                                           MPerXDL,
+                                           NPerXDL,
+                                           MRepeat,
+                                           NRepeat,
+                                           KPack>
+
+{
+
+    using Base = BlockwiseGemmXdlops_mx_pipeline_base<ThreadBlockSize,
+                                                      ADataType,
+                                                      BDataType,
+                                                      ATileDesc,
+                                                      BTileDesc,
+                                                      AMmaTileDesc,
+                                                      BMmaTileDesc,
+                                                      ABlockTransferSrcScalarPerVector,
+                                                      BBlockTransferSrcScalarPerVector,
+                                                      MPerBlock,
+                                                      NPerBlock,
+                                                      KPerBlock,
+                                                      MPerXDL,
+                                                      NPerXDL,
+                                                      MRepeat,
+                                                      NRepeat,
+                                                      KPack>;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::MWaves;
+    using Base::NWaves;
+    using Base::WaveSize;
+    using Base::xdlops_gemm;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetWaveIdx;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+    using Base::KThreadChunk;
+
+    using AccType      = typename Base::AccType;
+    using Tuple4       = typename Base::Tuple4;
+    using ComputeTypeA = typename Base::ComputeTypeA;
+    using ComputeTypeB = typename Base::ComputeTypeB;
+
+    static constexpr index_t PrefetchStages  = 1;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 1;
+
+    static constexpr auto ScalesPerKBlockSize =
+        KPerBlock / ScaleBlockSize; // How many mx-vectors per K block
+
+    //> How many mx-vectors in each row/col is processed in one call to xdlops_gemm.Run()
+    static constexpr auto ScalesPerXdlopsRun = (KPack * xdlops_gemm.K0PerXdlops) / ScaleBlockSize;
+
+    //> How many scales a thread must read to accommodate one call to xdlops_gemm.Run()
+    static constexpr auto ScalesPerXdlopsRunPerThread =
+        ScalesPerXdlopsRun / xdlops_gemm.mfma_instr.num_input_blks;
+
+    __host__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        ignore = num_loop;
+        return TailNumber::Full;
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              typename AScaleGridBuffer,
+              typename AScaleGridDesc,
+              typename AScaleThreadTransfer,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadTransfer>
+    __device__ void Run(
+        // ABlockCopy
+        const AGridDesc& a_grid_desc,
+        const ABlockDesc& a_block_desc,
+        ABlockTransfer& a_blockwise_copy,
+        const AGridBuffer& a_grid_buf,
+        ABlockBuffer& a_block_buf,
+        const ABlockTransferStep& a_block_copy_step,
+        // BBlockCopy
+        const BGridDesc& b_grid_desc,
+        const BBlockDesc& b_block_desc,
+        BBlockTransfer& b_blockwise_copy,
+        const BGridBuffer& b_grid_buf,
+        BBlockBuffer& b_block_buf,
+        const BBlockTransferStep& b_block_copy_step,
+        // CThread
+        CThreadBuffer& c_thread_buf,
+        // A and B scales
+        const AScaleGridDesc& a_scale_grid_desc,
+        AScaleThreadTransfer& a_scale_thread_copy,
+        const AScaleGridBuffer& a_scale_grid_buf,
+        const BScaleGridDesc& b_scale_grid_desc,
+        BScaleThreadTransfer& b_scale_thread_copy,
+        const BScaleGridBuffer& b_scale_grid_buf,
+        index_t num_loop) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+            a_scale_thread_desc.GetElementSpaceSize());
+
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BScaleDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Prefetch a_scales
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) {
+                    constexpr auto a_scale_offset =
+                        a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, s));
+                    auto a_scale_thread_buf_copy =
+                        make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+                            a_scale_thread_desc_copy.GetElementSpaceSize());
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc_copy,
+                                            make_tuple(I0, I0),
+                                            a_scale_thread_buf_copy);
+
+                    a_scale_thread_buf(Number<a_scale_offset>{}) =
+                        a_scale_thread_buf_copy[Number<0>{}];
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc,
+                        make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize));
+                });
+            });
+            a_scale_thread_copy.MoveSrcSliceWindow(
+                a_scale_grid_desc, make_multi_index(MWaves * MPerXDL, -ScalesPerKBlockSize));
+        });
+
+        // restore row id and advance to the next set of scales
+        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                               make_multi_index(-MPerBlock, ScalesPerKBlockSize));
+
+        // Prefetch b_scales
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) {
+                    constexpr auto b_scale_offset =
+                        b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, s));
+                    auto b_scale_thread_buf_copy =
+                        make_static_buffer<AddressSpaceEnum::Vgpr, BScaleDataType>(
+                            b_scale_thread_desc_copy.GetElementSpaceSize());
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc_copy,
+                                            make_tuple(I0, I0),
+                                            b_scale_thread_buf_copy);
+
+                    b_scale_thread_buf(Number<b_scale_offset>{}) =
+                        b_scale_thread_buf_copy[Number<0>{}];
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc,
+                        make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize));
+                });
+            });
+            b_scale_thread_copy.MoveSrcSliceWindow(
+                b_scale_grid_desc, make_multi_index(NWaves * NPerXDL, -ScalesPerKBlockSize));
+        });
+
+        // restore col id and advance to the next set of scales
+        // NWaves * NPerXDL * NRepeat == NPerBlock
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                               make_multi_index(-NPerBlock, ScalesPerKBlockSize));
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            // loop over k with the step KPerBlock
+            index_t i = 0;
+            do
+            {
+                // -------------------------------------------------------------------------------------------
+                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                block_sync_lds();
+
+                // k indexes mapping to threads for 32x32x64:
+                // t0 : |0  --> 15 32 --> 47 | 64 --> 79 96  --> 111 | etc.
+                // t32: |16 --> 31 48 --> 63 | 80 --> 95 112 --> 127 | etc.
+                //              k = 0                 k = 1
+
+                //  k indexes mapping to threads for 16x16x128:
+                // t0 : |0  --> 15 64  --> 79 | 128 --> 143 192 --> 207| etc.
+                // t16: |16 --> 31 80  --> 95 | 144 --> 159 208 --> 223| etc.
+                // t32: |32 --> 47 96  --> 111| 160 --> 175 224 --> 239| etc.
+                // t48: |48 --> 63 112 --> 127| 176 --> 191 240 --> 255| etc.
+                //              k = 0                    k = 1
+                static_for<0, KRepeat, 1>{}([&](auto k) {
+                    constexpr auto k_step =
+                        k * xdlops_gemm.KPerXdlops * (KPack / xdlops_gemm.K1PerXdlops);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) {
+                            constexpr auto a_k_step_chunk =
+                                k_step +
+                                chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k,
+                                make_tuple(m0, I0, I0, Number<a_k_step_chunk>{}),
+                                a_block_buf,
+                                a_thread_desc_,
+                                make_tuple(m0, I0, k, Number<chunk * KThreadChunk>{}),
+                                a_thread_buf);
+                        });
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        // read block data in chunks to assemble correct thread vectors
+                        static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) {
+                            constexpr auto b_k_step_chunk =
+                                k_step +
+                                chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                            b_thread_copy_.Run(
+                                b_block_desc_n0_n1_n2_k,
+                                make_tuple(n0, I0, I0, Number<b_k_step_chunk>{}),
+                                b_block_buf,
+                                b_thread_desc_,
+                                make_tuple(n0, I0, k, Number<chunk * KThreadChunk>{}),
+                                b_thread_buf);
+                        });
+                    });
+                });
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            vector_type<ComputeTypeA, KPack> a_thread_vec;
+                            vector_type<ComputeTypeB, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, ik))>{}];
+                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, ik))>{}];
+                            });
+
+                            constexpr index_t a_scale_offset =
+                                a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, I0));
+                            constexpr index_t b_scale_offset =
+                                b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, I0));
+
+                            static_assert(0 < ScalesPerXdlopsRunPerThread,
+                                          "Must have at least one scale per Xdlops per Thread.");
+
+                            vector_type<AScaleDataType, ScalesPerXdlopsRunPerThread>
+                                a_scale_thread_vec;
+                            vector_type<BScaleDataType, ScalesPerXdlopsRunPerThread>
+                                b_scale_thread_vec;
+
+                            // Pack scale_thread_buf into scale_thread_vec
+                            static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) {
+                                a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
+                                    a_scale_thread_buf[Number<a_scale_offset + s>{}];
+                                b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
+                                    b_scale_thread_buf[Number<b_scale_offset + s>{}];
+                            });
+
+                            using mfma_input_type_a =
+                                typename vector_type<ComputeTypeA, xdlops_gemm.K1PerXdlops>::type;
+                            using mfma_input_type_b =
+                                typename vector_type<ComputeTypeB, xdlops_gemm.K1PerXdlops>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            // MFMA accumulation
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type_a>(),
+                                a_scale_thread_vec.template AsType<AScaleDataType>(),
+                                b_thread_vec.template AsType<mfma_input_type_b>(),
+                                b_scale_thread_vec.template AsType<BScaleDataType>(),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+
+                // Prefetch a_scales
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) {
+                            constexpr auto a_scale_offset =
+                                a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, s));
+                            auto a_scale_thread_buf_copy =
+                                make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+                                    a_scale_thread_desc_copy.GetElementSpaceSize());
+                            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                                    a_scale_grid_buf,
+                                                    a_scale_thread_desc_copy,
+                                                    make_tuple(I0, I0),
+                                                    a_scale_thread_buf_copy);
+
+                            a_scale_thread_buf(Number<a_scale_offset>{}) =
+                                a_scale_thread_buf_copy[Number<0>{}];
+                            a_scale_thread_copy.MoveSrcSliceWindow(
+                                a_scale_grid_desc,
+                                make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize));
+                        });
+                    });
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc,
+                        make_multi_index(MWaves * MPerXDL, -ScalesPerKBlockSize));
+                });
+
+                // restore row id and advance to the next set of scales
+                a_scale_thread_copy.MoveSrcSliceWindow(
+                    a_scale_grid_desc, make_multi_index(-MPerBlock, ScalesPerKBlockSize));
+
+                // Prefetch b_scales
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) {
+                            constexpr auto b_scale_offset =
+                                b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, s));
+                            auto b_scale_thread_buf_copy =
+                                make_static_buffer<AddressSpaceEnum::Vgpr, BScaleDataType>(
+                                    b_scale_thread_desc_copy.GetElementSpaceSize());
+                            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                                    b_scale_grid_buf,
+                                                    b_scale_thread_desc_copy,
+                                                    make_tuple(I0, I0),
+                                                    b_scale_thread_buf_copy);
+
+                            b_scale_thread_buf(Number<b_scale_offset>{}) =
+                                b_scale_thread_buf_copy[Number<0>{}];
+                            b_scale_thread_copy.MoveSrcSliceWindow(
+                                b_scale_grid_desc,
+                                make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize));
+                        });
+                    });
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc,
+                        make_multi_index(NWaves * NPerXDL, -ScalesPerKBlockSize));
+                });
+
+                // restore col id and advance to the next set of scales
+                // NWaves * NPerXDL * NRepeat == NPerBlock
+                b_scale_thread_copy.MoveSrcSliceWindow(
+                    b_scale_grid_desc, make_multi_index(-NPerBlock, ScalesPerKBlockSize));
+
+                block_sync_lds();
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+                i += 1;
+            } while(i < (num_loop - 1));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Full)
+        {
+            block_sync_lds();
+
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                constexpr auto k_step =
+                    k * xdlops_gemm.KPerXdlops * (KPack / xdlops_gemm.K1PerXdlops);
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    // read block data in chunks to assemble correct thread
+                    static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) {
+                        constexpr auto a_k_step_chunk =
+                            k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<a_k_step_chunk>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, k, Number<chunk * KThreadChunk>{}),
+                                           a_thread_buf);
+                    });
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    // read block data in chunks to assemble correct thread
+                    static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) {
+                        constexpr auto b_k_step_chunk =
+                            k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<b_k_step_chunk>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k, Number<chunk * KThreadChunk>{}),
+                                           b_thread_buf);
+                    });
+                });
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeTypeA, KPack> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        constexpr index_t a_scale_offset =
+                            a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, I0));
+
+                        constexpr index_t b_scale_offset =
+                            b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, I0));
+
+                        vector_type<AScaleDataType, ScalesPerXdlopsRunPerThread> a_scale_thread_vec;
+                        vector_type<BScaleDataType, ScalesPerXdlopsRunPerThread> b_scale_thread_vec;
+
+                        // Pack b_scale_thread_buf into b_scale_thread_vec
+                        static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) {
+                            a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
+                                a_scale_thread_buf[Number<a_scale_offset + s>{}];
+                            b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
+                                b_scale_thread_buf[Number<b_scale_offset + s>{}];
+                        });
+
+                        using mfma_input_type_a =
+                            typename vector_type<ComputeTypeA, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_b =
+                            typename vector_type<ComputeTypeB, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        // MFMA accumulation
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            a_scale_thread_vec.template AsType<AScaleDataType>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
+                            b_scale_thread_vec.template AsType<BScaleDataType>(),
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+    }
+
+    // TODO: make this field protected when a_scale_thread_copy_ is moved
+    // here
+    static constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<KRepeat>{}, Number<ScalesPerXdlopsRunPerThread>{}));
+
+    // Is used to copy data from a_scale_grid to a_scale_thread
+    static constexpr auto a_scale_thread_desc_copy =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<1>{}));
+
+    // TODO: make this field protected when b_scale_thread_copy_ is moved
+    // here
+    static constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, Number<KRepeat>{}, Number<ScalesPerXdlopsRunPerThread>{}));
+
+    // Is used to copy data from b_scale_grid to b_scale_thread_buf
+    static constexpr auto b_scale_thread_desc_copy =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<1>{}));
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
index 40fa776484..c8ad9c5b02 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
@@ -96,7 +96,8 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                         NPerXDL,
                                         MRepeat,
                                         NRepeat,
-                                        KPack>
+                                        KPack,
+                                        true>
 
 {
     using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
@@ -117,7 +118,8 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                                    NPerXDL,
                                                    MRepeat,
                                                    NRepeat,
-                                                   KPack>;
+                                                   KPack,
+                                                   true>;
     using Base::I0;
     using Base::KRepeat;
     using Base::xdlops_gemm;
@@ -270,11 +272,26 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
         a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
         b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
 
-        a_scale_thread_copy.Run(a_scale_grid_desc,
-                                a_scale_grid_buf,
-                                a_scale_thread_desc,
-                                make_tuple(I0, I0),
-                                a_scale_thread_buf);
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                    a_scale_grid_buf,
+                                    a_scale_thread_desc,
+                                    make_tuple(m0, I0),
+                                    a_scale_thread_buf);
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if(num_loop_per_scale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
 
         b_scale_thread_copy.Run(b_scale_grid_desc,
                                 b_scale_grid_buf,
@@ -282,7 +299,6 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                 make_tuple(I0, I0),
                                 b_scale_thread_buf);
 
-        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step);
         b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
 
         // Local prefill 1
@@ -360,17 +376,32 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                     c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
                                 c_thread_buf(Number<c_offset>{}) +=
                                     c_thread_buf_per_scale[Number<t>{}] *
-                                    type_convert<AccDataType>(a_scale_thread_buf[I0]) *
+                                    type_convert<AccDataType>(a_scale_thread_buf[m0]) *
                                     type_convert<AccDataType>(b_scale_thread_buf[I0]);
                             });
                         });
                     });
 
-                    a_scale_thread_copy.Run(a_scale_grid_desc,
-                                            a_scale_grid_buf,
-                                            a_scale_thread_desc,
-                                            make_tuple(I0, I0),
-                                            a_scale_thread_buf);
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_scale_thread_copy.Run(a_scale_grid_desc,
+                                                a_scale_grid_buf,
+                                                a_scale_thread_desc,
+                                                make_tuple(m0, I0),
+                                                a_scale_thread_buf);
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{}));
+                    });
+
+                    if(num_loop_per_scale == 1)
+                    {
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{}));
+                    }
+                    else
+                    {
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{}));
+                    }
 
                     b_scale_thread_copy.Run(b_scale_grid_desc,
                                             b_scale_grid_buf,
@@ -378,8 +409,6 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                             make_tuple(I0, I0),
                                             b_scale_thread_buf);
 
-                    a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
-                                                           a_scale_thread_copy_step);
                     b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
                                                            b_scale_thread_copy_step);
 
@@ -453,17 +482,32 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
                             c_thread_buf(Number<c_offset>{}) +=
                                 c_thread_buf_per_scale[Number<t>{}] *
-                                type_convert<AccDataType>(a_scale_thread_buf[I0]) *
+                                type_convert<AccDataType>(a_scale_thread_buf[m0]) *
                                 type_convert<AccDataType>(b_scale_thread_buf[I0]);
                         });
                     });
                 });
 
-                a_scale_thread_copy.Run(a_scale_grid_desc,
-                                        a_scale_grid_buf,
-                                        a_scale_thread_desc,
-                                        make_tuple(I0, I0),
-                                        a_scale_thread_buf);
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc,
+                                            make_tuple(m0, I0),
+                                            a_scale_thread_buf);
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{}));
+                });
+
+                if(num_loop_per_scale == 1)
+                {
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{}));
+                }
+                else
+                {
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{}));
+                }
 
                 b_scale_thread_copy.Run(b_scale_grid_desc,
                                         b_scale_grid_buf,
@@ -471,7 +515,6 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                                         make_tuple(I0, I0),
                                         b_scale_thread_buf);
 
-                a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step);
                 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
 
                 block_sync_lds();
@@ -528,7 +571,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
                         c_thread_buf(Number<c_offset>{}) +=
                             c_thread_buf_per_scale[Number<t>{}] *
-                            type_convert<AccDataType>(a_scale_thread_buf[I0]) *
+                            type_convert<AccDataType>(a_scale_thread_buf[m0]) *
                             type_convert<AccDataType>(b_scale_thread_buf[I0]);
                     });
                 });
@@ -586,7 +629,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
                         c_thread_buf(Number<c_offset>{}) +=
                             c_thread_buf_per_scale[Number<t>{}] *
-                            type_convert<AccDataType>(a_scale_thread_buf[I0]) *
+                            type_convert<AccDataType>(a_scale_thread_buf[m0]) *
                             type_convert<AccDataType>(b_scale_thread_buf[I0]);
                     });
                 });
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
index de542866a6..fc0075b196 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
@@ -96,7 +96,8 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                                         NPerXDL,
                                         MRepeat,
                                         NRepeat,
-                                        KPack>
+                                        KPack,
+                                        true>
 
 {
     using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
@@ -117,7 +118,8 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                                                    NPerXDL,
                                                    MRepeat,
                                                    NRepeat,
-                                                   KPack>;
+                                                   KPack,
+                                                   true>;
     using Base::I0;
     using Base::KRepeat;
     using Base::xdlops_gemm;
@@ -177,11 +179,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
 
         constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
 
-        constexpr auto mfma_cycle            = NPerXDL == 16 ? 16 : 32;
-        constexpr auto ds_read_a_issue_cycle = 4;
-        // HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
-        constexpr auto ds_read_b_issue_cycle = 4;
-        // HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_b_issue_cycle =
+            HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
         constexpr auto ds_read_a_mfma_rate =
             (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
         constexpr auto ds_read_b_mfma_rate =
@@ -262,6 +264,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
     }
 
     template <bool HasMainLoop,
+              int NumKBlockPerScale,
               TailNumber TailNum,
               typename AGridDesc,
               typename ABlockDesc,
@@ -275,6 +278,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
               typename BGridBuffer,
               typename BBlockBuffer,
               typename BBlockTransferStep,
+              typename CScaleThreadDesc,
               typename CThreadBuffer,
               typename AScaleGridBuffer,
               typename AScaleGridDesc,
@@ -302,6 +306,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
         BBlockBuffer& b_block_buf,
         const BBlockTransferStep& b_block_copy_step,
         // CThread
+        const CScaleThreadDesc& c_scale_thread_desc,
         CThreadBuffer& c_thread_buf,
         // AScaleThreadCopy
         const AScaleGridDesc& a_scale_grid_desc,
@@ -316,12 +321,14 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
         const BScaleGridBuffer& b_scale_grid_buf,
         const BScaleThreadTransferStep& b_scale_thread_copy_step,
         // num_loop
-        index_t num_loop,
-        index_t num_loop_per_scale) const
+        index_t num_loop) const
     {
         __builtin_amdgcn_sched_barrier(0);
+        static_assert(CScaleThreadDesc{}.GetLength(Number<0>{}) == 1,
+                      "Pipeline v3 only support scaleblocksliceK=1");
+        static_assert(CScaleThreadDesc{}.GetLength(Number<2>{}) == 1,
+                      "Pipeline v3 only support scaleblocksliceN=1");
         // assume kperblock = scaleblockk
-        ignore            = num_loop_per_scale;
         auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
             a_thread_desc_.GetElementSpaceSize());
         auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
@@ -330,6 +337,8 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
             a_scale_thread_desc.GetElementSpaceSize());
         auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             b_scale_thread_desc.GetElementSpaceSize());
+        auto c_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            c_scale_thread_desc.GetElementSpaceSize());
 
         // Global prefetch 1
         a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -338,11 +347,26 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
         a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
         b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
 
-        a_scale_thread_copy.Run(a_scale_grid_desc,
-                                a_scale_grid_buf,
-                                a_scale_thread_desc,
-                                make_tuple(I0, I0),
-                                a_scale_thread_buf);
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                    a_scale_grid_buf,
+                                    a_scale_thread_desc,
+                                    make_tuple(m0, I0),
+                                    a_scale_thread_buf);
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
 
         b_scale_thread_copy.Run(b_scale_grid_desc,
                                 b_scale_grid_buf,
@@ -350,8 +374,12 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                                 make_tuple(I0, I0),
                                 b_scale_thread_buf);
 
-        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step);
         b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            c_scale_thread_buf(m0) = a_scale_thread_buf[m0] * b_scale_thread_buf[I0];
+        });
+
         // Local prefill 1
         a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
         b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
@@ -363,10 +391,44 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
         a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
         b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
 
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                    a_scale_grid_buf,
+                                    a_scale_thread_desc,
+                                    make_tuple(m0, I0),
+                                    a_scale_thread_buf);
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        });
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<2>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
+
+        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                b_scale_grid_buf,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_buf);
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
         // Initialize C
         c_thread_buf.Clear();
 
-        auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>();
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  AccDataType,
+                                  1,
+                                  xdlops_gemm.GetRegSizePerXdlops(),
+                                  true>
+            c_thread_buf_per_scale;
 
         // Local prefetch 1
         block_sync_lds();
@@ -409,7 +471,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
 
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        c_thread_buf_per_scale.Clear();
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        });
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
                             vector_type<ComputeDataType, KPack> a_thread_vec;
                             vector_type<ComputeDataType, KPack> b_thread_vec;
@@ -430,19 +495,23 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                             xdlops_gemm.template Run<>(
                                 a_thread_vec.template AsType<mfma_input_type>(),
                                 b_thread_vec.template AsType<mfma_input_type>(),
-                                c_thread_buf_per_scale.GetVectorTypeReference(I0));
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
                         });
                         static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
                             constexpr index_t c_offset =
                                 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
                             c_thread_buf(Number<c_offset>{}) +=
-                                c_thread_buf_per_scale[Number<t>{}] *
-                                type_convert<AccDataType>(a_scale_thread_buf[I0]) *
-                                type_convert<AccDataType>(b_scale_thread_buf[I0]);
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()[Number<t>{}] *
+                                type_convert<AccDataType>(c_scale_thread_buf[m0]);
                         });
                     });
                 });
 
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    c_scale_thread_buf(m0) = a_scale_thread_buf[m0] * b_scale_thread_buf[I0];
+                });
+
                 block_sync_lds();
                 static_for<0, KRepeat, 1>{}([&](auto k) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -462,11 +531,27 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                                            b_thread_buf);
                     });
                 });
-                a_scale_thread_copy.Run(a_scale_grid_desc,
-                                        a_scale_grid_buf,
-                                        a_scale_thread_desc,
-                                        make_tuple(I0, I0),
-                                        a_scale_thread_buf);
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc,
+                                            make_tuple(m0, I0),
+                                            a_scale_thread_buf);
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{}));
+                });
+
+                if constexpr(NumKBlockPerScale == 1)
+                {
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{}));
+                }
+                else
+                {
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{}));
+                }
 
                 b_scale_thread_copy.Run(b_scale_grid_desc,
                                         b_scale_grid_buf,
@@ -474,7 +559,6 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                                         make_tuple(I0, I0),
                                         b_scale_thread_buf);
 
-                a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step);
                 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
                 HotLoopScheduler();
                 __builtin_amdgcn_sched_barrier(0);
@@ -487,7 +571,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
         {
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    c_thread_buf_per_scale.Clear();
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                    });
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         vector_type<ComputeDataType, KPack> a_thread_vec;
                         vector_type<ComputeDataType, KPack> b_thread_vec;
@@ -507,15 +594,15 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
                         xdlops_gemm.template Run<>(
                             a_thread_vec.template AsType<mfma_input_type>(),
                             b_thread_vec.template AsType<mfma_input_type>(),
-                            c_thread_buf_per_scale.GetVectorTypeReference(I0));
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
                     });
                     static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t));
                         c_thread_buf(Number<c_offset>{}) +=
-                            c_thread_buf_per_scale[Number<t>{}] *
-                            type_convert<AccDataType>(a_scale_thread_buf[I0]) *
-                            type_convert<AccDataType>(b_scale_thread_buf[I0]);
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()[Number<t>{}] *
+                            type_convert<AccDataType>(c_scale_thread_buf[m0]);
                     });
                 });
             });
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp
new file mode 100644
index 0000000000..92aef65388
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp"
+
+namespace ck {
+
+/**
+ * @brief Blockwise data transfer
+ *
+ * This version does following things to avoid scratch memory issue
+ * 1. Use StaticallyIndexedArray instead of C array for thread buffer
+ * 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+ * 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+ *
+ */
+template <typename ThreadGroup,
+          typename SrcElementwiseOperation,
+          typename DstElementwiseOperation,
+          InMemoryDataOperationEnum DstInMemOp,
+          typename BlockSliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          index_t SrcScalarStrideInVector,
+          index_t DstScalarStrideInVector,
+          bool ThreadTransferSrcResetCoordinateAfterRun,
+          bool ThreadTransferDstResetCoordinateAfterRun,
+          typename IndexType,
+          index_t GatherDim        = 1,
+          index_t NumThreadScratch = 1>
+struct ThreadGroupTensorSliceTransfer_v4r1_gather
+{
+    static constexpr auto I0                   = Number<0>{};
+    static constexpr index_t nDim              = remove_reference_t<SrcDesc>::GetNumOfDimension();
+    static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};
+    static constexpr index_t gather_num        = thread_slice_lengths.At(Number<GatherDim>{});
+    using Index                                = MultiIndex<nDim>;
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1_gather(
+        const SrcDesc& src_desc,
+        const Index& src_block_slice_origin,
+        const SrcElementwiseOperation& src_element_op,
+        const DstDesc& dst_desc,
+        const Index& dst_block_slice_origin,
+        const DstElementwiseOperation& dst_element_op,
+        const StaticallyIndexedArray<IndexType, gather_num>& gather_offsets)
+        : threadwise_transfer_(src_desc,
+                               make_zero_multi_index<nDim>(),
+                               src_element_op,
+                               dst_desc,
+                               make_zero_multi_index<nDim>(),
+                               dst_element_op,
+                               gather_offsets)
+
+    {
+        static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<BlockSliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(ThreadGroup::GetThreadId()));
+            threadwise_transfer_.SetSrcSliceOrigin(
+                src_desc, src_block_slice_origin + thread_cluster_idx * thread_slice_lengths);
+            threadwise_transfer_.SetDstSliceOrigin(
+                dst_desc, dst_block_slice_origin + thread_cluster_idx * thread_slice_lengths);
+        }
+    }
+
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_block_slice_origin)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(ThreadGroup::GetThreadId()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+            threadwise_transfer_.SetSrcSliceOrigin(src_desc,
+                                                   src_block_slice_origin + thread_data_idx_begin);
+        }
+    }
+
+    template <typename SeqIdx, index_t ThreadScratchId = 0>
+    __device__ constexpr auto GetSrcThreadScratchIdx()
+    {
+        return threadwise_transfer_.template GetSrcThreadScratchIdx<SeqIdx, ThreadScratchId>();
+    }
+
+    template <typename SrcBuffer, index_t ThreadScratchId = 0>
+    __device__ void RunRead(const SrcDesc& src_desc,
+                            const SrcBuffer& src_buf,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunRead(src_desc, src_buf, thread_scratch_id);
+        }
+    }
+
+    template <typename DstBuffer, index_t ThreadScratchId = 0>
+    __device__ void RunWrite(const DstDesc& dst_desc,
+                             DstBuffer& dst_buf,
+                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunWrite(dst_desc, dst_buf, thread_scratch_id);
+        }
+    }
+
+    template <typename SrcBuffer, typename DstBuffer, index_t ThreadScratchId>
+    __device__ void Run(const SrcDesc& src_desc,
+                        const SrcBuffer& src_buf,
+                        const DstDesc& dst_desc,
+                        DstBuffer& dst_buf,
+                        Number<ThreadScratchId> thread_scratch_id)
+    {
+        RunRead(src_desc, src_buf, thread_scratch_id);
+        RunWrite(dst_desc, dst_buf, thread_scratch_id);
+    }
+
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
+        }
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v3r1_gather<decltype(thread_slice_lengths),
+                                                  SrcElementwiseOperation,
+                                                  DstElementwiseOperation,
+                                                  DstInMemOp,
+                                                  SrcData,
+                                                  DstData,
+                                                  SrcDesc,
+                                                  DstDesc,
+                                                  SrcDimAccessOrder,
+                                                  DstDimAccessOrder,
+                                                  SrcVectorDim,
+                                                  DstVectorDim,
+                                                  SrcScalarPerVector,
+                                                  DstScalarPerVector,
+                                                  SrcScalarStrideInVector,
+                                                  DstScalarStrideInVector,
+                                                  ThreadTransferSrcResetCoordinateAfterRun,
+                                                  ThreadTransferDstResetCoordinateAfterRun,
+                                                  IndexType,
+                                                  GatherDim,
+                                                  NumThreadScratch>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
new file mode 100644
index 0000000000..bee0b01a74
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp"
+#include "ck/utility/is_detected.hpp"
+
+namespace ck {
+
+// Thread-group level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalerPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//
+// Does following things to avoid scratch memory issue
+//   1. Pass tensor descritpors by reference (or tuple of references)
+//   2. Does not keep reference to tensor descriptor
+//   3. Does not construct new tensor coordinate when call Run()
+template <typename ThreadGroup,
+          typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          typename SrcScalarPerVectors,
+          index_t DstScalarPerVector,
+          typename ThreadTransferSrcResetCoordinateAfterRunFlags,
+          typename ThreadTransferDstResetCoordinateAfterRunFlags,
+          typename IndexType,
+          index_t ScatterDim       = 1,
+          bool OutputScatter       = true,
+          index_t ScatterWeightIdx = 3,
+          index_t NumThreadScratch = 1>
+struct ThreadGroupTensorSliceTransfer_v7r3_scatter
+{
+    static constexpr index_t nDim =
+        remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
+
+    static constexpr index_t mod_num =
+        ThreadClusterLengths{}.At(Number<3>{}); // Dirty HACK FELIX, TODO fix
+    static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
+    static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+    static constexpr index_t scatter_num       = thread_slice_lengths.At(Number<ScatterDim>{});
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v7r3_scatter(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
+        const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src_descs,
+                               StaticallyIndexedArray<Index, nSrc>{},
+                               dst_descs,
+                               StaticallyIndexedArray<Index, nDst>{},
+                               element_op)
+    {
+        static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
+                          nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
+                          nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
+                          nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
+                      "wrong!");
+
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_assert(nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto src_thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(ThreadGroup::GetThreadId()));
+            const auto src_thread_slice_origins = generate_tuple(
+                [&](auto i) {
+                    return src_block_slice_origins[i] +
+                           src_thread_cluster_idx * thread_slice_lengths;
+                },
+                Number<nSrc>{});
+
+            const auto dst_thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(OutputScatter ? ThreadGroup::GetThreadId() % mod_num
+                                               : ThreadGroup::GetThreadId()));
+            const auto dst_thread_slice_origins = generate_tuple(
+                [&](auto i) {
+                    return dst_block_slice_origins[i] +
+                           dst_thread_cluster_idx * thread_slice_lengths;
+                },
+                Number<nDst>{});
+
+            threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
+            threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
+        }
+    }
+
+    template <typename SrcBuffers, index_t ThreadScratchId = 0>
+    __device__ void RunRead(const SrcDescs& src_descs,
+                            const SrcBuffers& src_bufs,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunRead(src_descs, src_bufs, thread_scratch_id);
+        }
+    }
+
+    template <typename T>
+    using is_tuple = decltype(std::declval<T&>().IsTuple());
+
+    template <typename DstBuffers, index_t ThreadScratchId = 0>
+    __device__ void RunWrite(const DstDescs& dst_descs,
+                             DstBuffers dst_bufs,
+                             StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets,
+                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            if constexpr(is_detected<is_tuple, decltype(dst_bufs)>::value)
+                threadwise_transfer_.RunWrite(
+                    dst_descs, dst_bufs, scatter_offsets, thread_scratch_id);
+            else
+                threadwise_transfer_.RunWrite(
+                    dst_descs, tie(dst_bufs), scatter_offsets, thread_scratch_id);
+        }
+    }
+
+    template <typename SrcBuffers, typename DstBuffers>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs,
+                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs, scatter_offsets);
+    }
+
+    template <index_t ISrc>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
+        }
+    }
+
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, const Index& step)
+    {
+        static_for<0, SrcDescs::Size(), 1>{}(
+            [&](auto i) { MoveSrcSliceWindow(src_descs, i, step); });
+    }
+
+    template <index_t IDst>
+    __device__ void
+    MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs, const Index& step)
+    {
+        static_for<0, DstDescs::Size(), 1>{}(
+            [&](auto i) { MoveDstSliceWindow(dst_descs, i, step); });
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v7r3_scatter<SrcDatas,
+                                                   DstDatas,
+                                                   SrcDescs,
+                                                   DstDescs,
+                                                   ElementwiseOperation,
+                                                   DstInMemOps,
+                                                   decltype(thread_slice_lengths),
+                                                   SrcDimAccessOrder,
+                                                   DstDimAccessOrder,
+                                                   SrcVectorDim,
+                                                   DstVectorDim,
+                                                   SrcScalarPerVectors,
+                                                   DstScalarPerVector,
+                                                   ThreadTransferSrcResetCoordinateAfterRunFlags,
+                                                   ThreadTransferDstResetCoordinateAfterRunFlags,
+                                                   IndexType,
+                                                   ScatterDim,
+                                                   OutputScatter,
+                                                   ScatterWeightIdx,
+                                                   NumThreadScratch>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
index 3c79b92ec8..ef0b5286ac 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -125,6 +125,7 @@ struct DeviceGemmMultipleDSplitKBPreShuffle : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
 
+#ifndef CK_CODE_GEN_RTC
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_a,
                         const void* p_b,
@@ -145,6 +146,7 @@ struct DeviceGemmMultipleDSplitKBPreShuffle : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 
     virtual int GetPreShuffleParameters() = 0;
+#endif
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_mx.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_mx.hpp
new file mode 100644
index 0000000000..e89185a35c
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_mx.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          index_t ScaleBlockSize,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmMX : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_a_scale,
+                        const void* p_b,
+                        const void* p_b_scale,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideAScale,
+                        ck::index_t StrideB,
+                        ck::index_t StrideBScale,
+                        ck::index_t StrideC,
+                        ck::index_t KBatch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
index 78d8aa997e..b251fb97b9 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -114,6 +114,40 @@ struct DeviceGemmV2BScale : public BaseOperator
     virtual ck::index_t GetKPerBlock() = 0;
 };
 
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmV2BPreshuffle : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        ck::index_t KSplit,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+
+    virtual bool GetPermuteA()            = 0;
+    virtual bool GetPermuteB()            = 0;
+    virtual ck::index_t GetKPerBlock()    = 0;
+    virtual int GetPreShuffleParameters() = 0;
+};
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
index 2abf1d5a10..9c44bda5ca 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -59,7 +59,8 @@ struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
         const std::array<index_t, NDimSpatial>& input_right_pads,
         const AElementwiseOperation& a_element_op,
         const BElementwiseOperation& b_element_op,
-        const CDEElementwiseOperation& cde_element_op) = 0;
+        const CDEElementwiseOperation& cde_element_op,
+        const ck::index_t split_k = 1) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
index f6c228fb7b..d38698af4b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
index 30ae72a63e..de7d67f08b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
index 2662e5c360..bae5c6019d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index b4ab96d397..e846b0630b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -614,7 +614,6 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         return true;
     }
 
-#ifndef __HIPCC_RTC__
     static constexpr bool
     IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
     {
@@ -705,6 +704,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         return true;
     }
 
+#ifndef __HIPCC_RTC__
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!ck::is_xdl_supported())
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp
index 963f0edd08..7d9555dc82 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp
@@ -224,12 +224,20 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
         PermuteA,
         PermuteB>;
 
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     static constexpr index_t BPackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
             return 2;
         else
             return 1;
     }();
+
     struct ComputePtrOffsetOfStridedBatch
     {
         ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
@@ -352,10 +360,10 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale
                     const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
                         arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
 
-                    auto size_a_buffer =
-                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
-                    auto size_b_buffer =
-                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / BPackedSize;
 
                     ck::utility::RotatingMemWrapper<Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index 0b73317c5e..d4f89b3e09 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index 13eb23574f..a8eb73d730 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index 28778d825b..6eb9281d30 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 7fa231d4f4..5fad21f521 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 3be7313d2b..c7aa54f1d9 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index 8aa20f7ad4..68ec8187a4 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef DEVICE_CONV3D_FWD_XDL_HPP
 #define DEVICE_CONV3D_FWD_XDL_HPP
@@ -10,6 +10,7 @@
 #include "device.hpp"
 #include "device_conv_fwd.hpp"
 #include "common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "tensor_layout.hpp"
 #include "convolution_forward_specialization.hpp"
 #include "tensor_descriptor.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
index 1edae33be3..ddabd61c3d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
index de8f35a640..2881036bee 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
index eb0fb55f5d..7faee161c1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
index 480402b7e1..d5fec7201a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
@@ -15,6 +15,7 @@
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -177,14 +178,57 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
             const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
 
             const auto Run = [&](const auto& kernel) {
-                if(arg.KBatch > 1)
-                    hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
-                                                     0,
-                                                     arg.M * arg.N * sizeof(CDataType),
-                                                     stream_config.stream_id_));
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
 
-                ave_time = launch_and_time_kernel(
-                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
             };
 
             constexpr index_t minimum_occupancy =
@@ -195,7 +239,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
 
             if(has_main_k_block_loop)
             {
-                // Tail number always 1
+                // Tail number always full
                 if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
                              BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
                 {
@@ -208,127 +252,13 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
                         Run(kernel);
                     }
                 }
-                // Tail number could be One to Seven
-                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-                {
-                    {
-                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
-                        {
-                            const auto kernel =
-                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                            true,
-                                                            InMemoryDataOperationEnum::Set,
-                                                            minimum_occupancy,
-                                                            TailNumber::One>;
-                            Run(kernel);
-                        }
-                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
-                                TailNumber::Full)
-                        {
-                            const auto kernel =
-                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                            true,
-                                                            InMemoryDataOperationEnum::Set,
-                                                            minimum_occupancy,
-                                                            TailNumber::Full>;
-                            Run(kernel);
-                        }
-
-                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
-                        {
-                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
-                            {
-                                const auto kernel =
-                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy,
-                                                                TailNumber::Two>;
-                                Run(kernel);
-                            }
-                        }
-
-                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
-                        {
-                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
-                               TailNumber::Three)
-                            {
-                                const auto kernel =
-                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy,
-                                                                TailNumber::Three>;
-                                Run(kernel);
-                            }
-                        }
-
-                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
-                        {
-                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
-                               TailNumber::Four)
-                            {
-                                const auto kernel =
-                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy,
-                                                                TailNumber::Four>;
-                                Run(kernel);
-                            }
-                        }
-
-                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
-                        {
-                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
-                               TailNumber::Five)
-                            {
-                                const auto kernel =
-                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy,
-                                                                TailNumber::Five>;
-                                Run(kernel);
-                            }
-                        }
-
-                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
-                        {
-                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
-                            {
-                                const auto kernel =
-                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy,
-                                                                TailNumber::Six>;
-                                Run(kernel);
-                            }
-                        }
-
-                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
-                        {
-                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
-                               TailNumber::Seven)
-                            {
-                                const auto kernel =
-                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
-                                                                true,
-                                                                InMemoryDataOperationEnum::Set,
-                                                                minimum_occupancy,
-                                                                TailNumber::Seven>;
-                                Run(kernel);
-                            }
-                        }
-                    }
-                }
             }
             else
             {
                 // Tail number always 1
                 if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
                 {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full)
                     {
                         const auto kernel =
                             kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
@@ -337,6 +267,16 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
                                                         minimum_occupancy>;
                         Run(kernel);
                     }
+                    else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Odd>;
+                        Run(kernel);
+                    }
                 }
             }
             return ave_time;
@@ -363,10 +303,11 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
             return false;
         }
 
-        if(ScaleBlockM % MPerBlock != 0 || ScaleBlockN % NPerBlock != 0 || ScaleBlockK != KPerBlock)
-        {
-            return false;
-        }
+        // if(ScaleBlockM % MPerBlock != 0 || ScaleBlockN % NPerBlock != 0 || ScaleBlockK !=
+        // KPerBlock)
+        // {
+        //     return false;
+        // }
 
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                        GemmSpec == GemmSpecialization::NKPadding ||
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp
new file mode 100644
index 0000000000..1ef8a9b8ad
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp
@@ -0,0 +1,542 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/// @brief \"Universal\" GEMM operation with SplitK support.
+///
+/// @par Overview
+///         This GEMM operation implements the following mathematical equation:
+///         C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N}))
+///         Where A, B are input tensors and C is the output tensor. The A/B/C_op are
+///         elementwise operations applied to the A, B, and C tensors, respectively.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through it's design
+///         and versatilty.
+///
+/// @note   This Kernel implementation supports SplitK algorithm. It can be configured
+///         to split the dot product accumulated over the K dimension into multiple working groups.
+///         The partial products of different workgroups are then reduced using the AtomicAdd
+///         operation.
+///
+/// @tparam ALayout     A tensor data layout.
+/// @tparam BLayout     B tensor data layout.
+/// @tparam CLayout     C tensor data layout.
+/// @tparam ADataType   A tensor data type.
+/// @tparam BDataType   B tensor data type.
+/// @tparam CDataType   C tensor data type.
+/// @tparam AccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor
+///                               (after GEMM).
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1         The vector load size from global memory for A tensor.
+/// @tparam BK1         The vector load size from global memory for B tensor.
+/// @tparam MPerWmma    M size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam NPerWmma    N size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam MRepeat     The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NRepeat     The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BBlockLdsExtraN                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam CShuffleMRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access.
+///                                         Used when storing data to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled).
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2<ALayout,
+                                                        BLayout,
+                                                        CLayout,
+                                                        ADataType,
+                                                        BDataType,
+                                                        CDataType,
+                                                        AElementwiseOperation,
+                                                        BElementwiseOperation,
+                                                        CElementwiseOperation>
+{
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        AccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerWmma,
+        NPerWmma,
+        MRepeat,
+        NRepeat,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    /// @brief  Helper structure responsible for kernel invocation.
+    ///
+    /// @paragraph  The `Invoker` class is responsible for preparation and invocation of actual GPU
+    ///             kernel function. It usually determines the launched grid size prepares kernel
+    ///             arguments as well as perform specific kernel configuration selection based on
+    ///             runtime arguments.
+    ///
+    /// @note       If appropriately configured it may measure kernel execution time.
+    ///
+    struct Invoker : public BaseInvoker
+    {
+        /// @brief  This function issues GPU kernel execution.
+        /// @param arg           The GPU kernel arguments.
+        /// @param stream_config The HIP stream configuration helper structure.
+        /// @return              The kernel's average execution time (if time measurement is
+        ///                      enabled).
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+                GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            HIP_CHECK_ERROR(hipMemsetAsync(arg_.p_c_grid,
+                                                           0,
+                                                           arg_.M * arg_.N * sizeof(CDataType),
+                                                           stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        HIP_CHECK_ERROR(hipMemsetAsync(arg.p_c_grid,
+                                                       0,
+                                                       arg.M * arg.N * sizeof(CDataType),
+                                                       stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr index_t minimum_occupancy = []() {
+                if constexpr(BlkGemmPipeSched == BlockGemmPipelineScheduler::Interwave)
+                {
+                    return 2;
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    return (MPerBlock * NPerBlock / BlockSize <= 128) ? 2 : 1;
+                }
+                else
+                {
+                    return 1;
+                }
+            }();
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
+                                                         true,
+                                                         InMemoryDataOperationEnum::AtomicAdd,
+                                                         minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_wmma_cshuffle_v3<GridwiseGemm,
+                                                         true,
+                                                         InMemoryDataOperationEnum::Set,
+                                                         minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                else
+                {
+                    // TODO: Implement
+                }
+            }
+            else
+            {
+                // TODO: Implement
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_gfx11_supported() && !ck::is_gfx12_supported())
+        {
+            return false;
+        }
+
+        if constexpr(std::is_same_v<CDataType, ck::half_t> ||
+                     std::is_same_v<CDataType, ck::bhalf_t>)
+        {
+            if(arg.KBatch > 1 && ck::is_gfx11_supported())
+            {
+                // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<ADataType, f8_t> || std::is_same_v<ADataType, bf8_t> ||
+                     std::is_same_v<BDataType, f8_t> || std::is_same_v<BDataType, bf8_t>)
+        {
+            if(ck::is_gfx11_supported())
+            {
+                return false;
+            }
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    index_t GetKPerBlock() override { return KPerBlock; }
+
+    bool GetPermuteA() override { return PermuteA; }
+    bool GetPermuteB() override { return PermuteB; }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
+    {
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, KBatch};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      index_t StrideC,
+                                                      index_t KBatch,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideC,
+                                          KBatch);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemm_Wmma_CShuffleV3"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock << "x" << NPerBlock << "x" << KPerBlock << ", "
+            << "WaveTile: "
+            << MPerWmma << "x"<<NPerWmma << ", "
+            << "WaveMap: "
+            << MRepeat << "x" << NRepeat << ", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector << "x" << BBlockTransferSrcScalarPerVector << ", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "KPack: "
+            << GridwiseGemm::KPack;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
index a8cf681995..dde21725d0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,6 +21,105 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
+/// @brief \"Universal\" GEMM operation with SplitK support.
+///
+/// @par Overview
+///         This GEMM operation implements the following mathematical equation:
+///         C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N}))
+///         Where A, B are input tensors and C is the output tensor. The A/B/C_op are
+///         elementwise operations applied to the A, B, and C tensors, respectively.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through it's design
+///         and versatilty.
+///
+/// @note   This Kernel implementation supports SplitK algorithm. It can be configured
+///         to split the dot product accumulated over the K dimension into multiple working groups.
+///         The partial products of different workgroups are then reduced using the AtomicAdd
+///         operation.
+///
+/// @tparam ALayout     A tensor data layout.
+/// @tparam BLayout     B tensor data layout.
+/// @tparam CLayout     C tensor data layout.
+/// @tparam ADataType   A tensor data type.
+/// @tparam BDataType   B tensor data type.
+/// @tparam CDataType   C tensor data type.
+/// @tparam GemmAccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor
+///                               (after GEMM).
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1         The vector load size from global memory for A tensor.
+/// @tparam BK1         The vector load size from global memory for B tensor.
+/// @tparam MPerXDL     M size of matrix-fused-multiply-add instruction.
+/// @tparam NPerXDL     N size of matrix-fused-multiply-add instruction.
+/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BBlockLdsExtraN                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam CShuffleMXdlPerWavePerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNXdlPerWavePerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access.
+///                                         Used when storing data to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled).
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -130,9 +229,36 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
 
     using Argument = typename GridwiseGemm::Argument;
 
-    // Invoker
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    /// @brief  Helper structure responsible for kernel invocation.
+    ///
+    /// @paragraph  The `Invoker` class is responsible for preparation and invocation of actual GPU
+    ///             kernel function. It usually determines the launched grid size prepares kernel
+    ///             arguments as well as perform specific kernel configuration selection based on
+    ///             runtime arguments.
+    ///
+    /// @note       If appropriately configured it may measure kernel execution time.
+    ///
     struct Invoker : public BaseInvoker
     {
+        /// @brief  This function issues GPU kernel execution.
+        /// @param arg           The GPU kernel arguments.
+        /// @param stream_config The HIP stream configuration helper structure.
+        /// @return              The kernel's average execution time (if time measurement is
+        ///                      enabled).
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
@@ -166,10 +292,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                     const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
                         arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
 
-                    auto size_a_buffer =
-                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
-                    auto size_b_buffer =
-                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / BPackedSize;
 
                     ck::utility::RotatingMemWrapper<Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
new file mode 100644
index 0000000000..faa235be50
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle<ALayout,
+                                                                              BLayout,
+                                                                              CLayout,
+                                                                              ADataType,
+                                                                              BDataType,
+                                                                              CDataType,
+                                                                              AElementwiseOperation,
+                                                                              BElementwiseOperation,
+                                                                              CElementwiseOperation>
+{
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3_b_preshuffle<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    int GetPreShuffleParameters() override { return NPerXDL; }
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+                GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / BPackedSize;
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr auto estimated_reg_a = MPerBlock * KPerBlock * sizeof(ADataType) / BlockSize /
+                                             4 * (1 + GridwiseGemm::NWave);
+            constexpr auto estimated_reg_b =
+                NPerBlock * KPerBlock * sizeof(BDataType) / BlockSize / 4 * (2);
+            constexpr auto estimated_reg_c =
+                MPerBlock * NPerBlock * sizeof(GemmAccDataType) / BlockSize / 4;
+            constexpr auto estimated_reg_total =
+                estimated_reg_a + estimated_reg_b + estimated_reg_c;
+
+            constexpr index_t minimum_occupancy = (estimated_reg_total >= 256) ? 1 : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 ||
+                                  BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("Only support pipeline ver v1, v2, v3 now!");
+                }
+            }
+#if 0
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                        kernel_gemm_xdl_cshuffle_v3_b_preshuffle<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy,
+                                                        TailNumber::Odd>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                        kernel_gemm_xdl_cshuffle_v3_b_preshuffle<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Odd>;
+                        Run(kernel);
+                    }
+                }
+            }
+#endif
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    index_t GetKPerBlock() override { return KPerBlock; }
+
+    bool GetPermuteA() override { return PermuteA; }
+    bool GetPermuteB() override { return PermuteB; }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
+    {
+        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, KBatch};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      index_t StrideC,
+                                                      index_t KBatch,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideC,
+                                          KBatch);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmXdlUniversal"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "Kpack: "
+            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
index 044350d11c..456e5e90d1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -139,6 +139,20 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
 
     using Argument = typename GridwiseGemm::Argument;
 
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     // Invoker
     struct Invoker : public BaseInvoker
     {
@@ -174,10 +188,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
                     const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
                         arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
 
-                    auto size_a_buffer =
-                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
-                    auto size_b_buffer =
-                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / BPackedSize;
 
                     ck::utility::RotatingMemWrapper<Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
new file mode 100644
index 0000000000..c37af49387
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_mx.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// clang-format off
+/**
+ * \brief WIP: Implements XDL CShuffle V3 GEMM for microscale-compliant data types
+ *
+ * This class is a work-in-progress implementation of the XDL CShuffle V3 GEMM for
+ * microscale-compliant data types.
+ *
+ * Assumptions:
+ * - A and B data types are compliant with the OCP Microscaling Formats (MX) Specification
+ * - Each scale applies to ScaleBlockSize elements in K direction
+ * - A scale matrix is a row-major
+ * - B scale matrix is a column-major
+ * - Scale data types must have get_exponent_value() specialization, whereas lowest 8 bits of the
+ * exponent will be interpreted as conventional biased Float32 exponent (E8M0)
+ *
+ * Tunable parameters.
+ * The CK instance includes a series of tunable template parameters to control the parallel
+ * granularity of the workload to achieve load balancing on different hardware platforms. These
+ * parameters include Block Size, M/N/K Per Block, M/N per XDL, AK1, BK1, etc.
+ *  - Block Size determines the number of threads in the thread block.
+ *  - M/N/K Per Block determines the size of tile that each thread block is responsible for
+ * calculating.
+ *  - M/N Per XDL refers to M/N size for Instinct accelerator Matrix Fused Multiply Add (MFMA)
+ * instructions operating on a per-wavefront basis.
+ *  - A/B K1 is related to the data type. It can be any value ranging from 1 to K Per Block. To
+ * achieve the optimal load/store performance, 128bit per load is suggested. In addition, the A/B
+ * loading parameters must be changed accordingly to match the A/B K1 value; otherwise, it will
+ * result in compilation errors.
+ *
+ * Conditions for achieving computational load balancing on different hardware platforms can vary.
+ *
+ * Serialized version of the algorithm:
+ * \code
+ * // E = A * B + C
+ * // Loop over E[MPerBlock,NPerBlock] tiles
+ * for(int mb = 0; mb < M; mb += MPerBlock){
+ *    for(int nb = 0; nb < N; nb += NPerBlock){
+ *       // initialize E[MPerBlock,NPerBlock] tile
+ *       for(int mt = mb; mt < mb + MPerBlock; mt++){
+ *          for(int nt = nb; nt < nb + NPerBlock; nt++){
+ *             E[mt,nt] = C[mt,nt];
+ *          }
+ *       }
+ *
+ *       // multiply-accumulate per tile
+ *       for(int kb = 0; kb < K; kb += KPerBlock){
+ *         for(int m0 = mb; m0 < mb + MPerBlock; m0 += MWaves * MPerXDL){
+ *           for(int n0 = nb; n0 < nb + NPerBlock; n0 += NWaves * NPerXDL){
+ *             for(int mw = m0; mw < m0 + MWaves * MPerXDL; mw += MPerXDL){
+ *               for(int nw = n0; nw < n0 + NWaves * NPerXDL; nw += NPerXDL){
+ *                 for(int k0 = kb; k0 < kb + KPerBlock; k0 += mfma.num_input_blks*KPack){
+ *                   // MFMA accumulation
+ *                   for(int k_pack = k0; k_pack < k0 + mfma.num_input_blks*KPack; k_pack += KPerXdlops){
+ *                     // MFMA instruction
+ *                     for(int k_mfma = k_pack; k_mfma < k_pack + KPerXdlops; k_mfma += mfma.k_per_blk){
+ *                       for(int m = mw; m < mw + MPerXDL; m++){
+ *                         for(int n = nw; n < nw + NPerXDL; n++){
+ *                           for(int k = k_mfma; k < k_mfma + mfma.k_per_blk; k++){
+ *                            E[m,n] += A[m,k] * B[k,n];
+ *                           }
+ *                         }
+ *                       }
+ *                     }
+ *                   }
+ *                 }
+ *               }
+ *             }
+ *           }
+ *         }
+ *       }
+ *    }
+ * }
+ * \endcode
+ *
+ */
+// clang-format on
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          typename GemmAccDataType, // TODO: always float
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t ScaleBlockSize, // Scaling block size
+          index_t BlockSize,      // Thread block size
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA =
+              ADataType, // XXX: These should always be the same as ADataType and BDataType
+          typename ComputeTypeB =
+              BDataType // TODO: Hardcode them and remove from the list of template parameters
+          >
+struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
+                                                         BLayout,
+                                                         CLayout,
+                                                         ADataType,
+                                                         AScaleDataType,
+                                                         BDataType,
+                                                         BScaleDataType,
+                                                         CDataType,
+                                                         ScaleBlockSize,
+                                                         AElementwiseOperation,
+                                                         BElementwiseOperation,
+                                                         CElementwiseOperation>
+{
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMX_xdl_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        AScaleDataType,
+        BDataType,
+        BScaleDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        ScaleBlockSize,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+                GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            // TODO: Check if this is the right algorithm for minimum_occupancy
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave
+                    ? (BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 &&
+                       MPerBlock * NPerBlock * KPerBlock * sizeof(ADataType) <= 128 * 128 * 64 * 2)
+                          ? 2
+                          : 1
+                    : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        static_assert(is_scale_mfma_data_type<ADataType>() && is_scale_mfma_data_type<BDataType>(),
+                      "Only microscaling formats are supported for ADataType and BDataType");
+
+        static_assert(ScaleBlockSize == 32, "Only ScaleBlockSize 32 is supported");
+
+        static_assert(is_same_v<ComputeTypeA, ADataType> && is_same_v<ComputeTypeB, BDataType>,
+                      "ComputeTypeA and ComputeTypeB must be the same as ADataType and BDataType");
+
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if constexpr(!IsValidCompilationParameter())
+        {
+            return false;
+        }
+
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const AScaleDataType* p_a_scale,
+                             const BDataType* p_b,
+                             const BScaleDataType* p_b_scale,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideScaleA,
+                             index_t StrideB,
+                             index_t StrideScaleB,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{p_a,
+                        p_a_scale,
+                        p_b,
+                        p_b_scale,
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideScaleA,
+                        StrideB,
+                        StrideScaleB,
+                        StrideC,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_a_scale,
+                                                      const void* p_b,
+                                                      const void* p_b_scale,
+                                                      void* p_c,
+                                                      ck::index_t M,
+                                                      ck::index_t N,
+                                                      ck::index_t K,
+                                                      ck::index_t StrideA,
+                                                      ck::index_t StrideScaleA,
+                                                      ck::index_t StrideB,
+                                                      ck::index_t StrideScaleB,
+                                                      ck::index_t StrideC,
+                                                      ck::index_t KBatch,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const AScaleDataType*>(p_a_scale),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<const BScaleDataType*>(p_b_scale),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideScaleA,
+                                          StrideB,
+                                          StrideScaleB,
+                                          StrideC,
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmMX_Xdl_CShuffleV3"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "Kpack: "
+            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride << ", "
+            << "ScaleBlockSize: "
+            << ScaleBlockSize;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
index fd6f3b65f2..213501468a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
index c2a27ebbdb..7315fe75a3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
index 359711e5c4..5e41c96dfc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -227,7 +227,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                  const std::array<index_t, NDimSpatial>& input_right_pads,
                  const AElementwiseOp& a_element_op,
                  const BElementwiseOp& b_element_op,
-                 const CDEElementwiseOp& cde_element_op)
+                 const CDEElementwiseOp& cde_element_op,
+                 const ck::index_t split_k = 1)
             : p_a_grid_{static_cast<const ADataType*>(p_a)},
               p_b_grid_{static_cast<const BDataType*>(p_b)},
               p_ds_grid_{},
@@ -240,7 +241,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
               b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
               conv_filter_strides_{conv_filter_strides},
               input_left_pads_{input_left_pads},
-              input_right_pads_{input_right_pads}
+              input_right_pads_{input_right_pads},
+              k_batch_{split_k}
         {
             // populate Ds pointer
             static_for<0, NumDTensor, 1>{}([&](auto i) {
@@ -445,6 +447,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
         std::array<index_t, NDimSpatial> conv_filter_strides_;
         std::array<index_t, NDimSpatial> input_left_pads_;
         std::array<index_t, NDimSpatial> input_right_pads_;
+
+        const index_t k_batch_;
     };
 
     // Invoker
@@ -534,6 +538,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
 
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(arg.k_batch_ != 1)
+        {
+            return false;
+        }
+
         // check device
         if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
         {
@@ -691,7 +700,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                  const std::array<index_t, NDimSpatial>& input_right_pads,
                  const AElementwiseOp& a_element_op,
                  const BElementwiseOp& b_element_op,
-                 const CDEElementwiseOp& cde_element_op)
+                 const CDEElementwiseOp& cde_element_op,
+                 const ck::index_t split_k = 1)
     {
         return Argument{p_a,
                         p_b,
@@ -711,7 +721,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                         input_right_pads,
                         a_element_op,
                         b_element_op,
-                        cde_element_op};
+                        cde_element_op,
+                        split_k};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -737,7 +748,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
         const std::array<index_t, NDimSpatial>& input_right_pads,
         const AElementwiseOp& a_element_op,
         const BElementwiseOp& b_element_op,
-        const CDEElementwiseOp& cde_element_op) override
+        const CDEElementwiseOp& cde_element_op,
+        const ck::index_t split_k = 1) override
     {
         return std::make_unique<Argument>(p_a,
                                           p_b,
@@ -757,7 +769,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                                           input_right_pads,
                                           a_element_op,
                                           b_element_op,
-                                          cde_element_op);
+                                          cde_element_op,
+                                          split_k);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 99bd3be15d..3028cd7cbc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -1,22 +1,29 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include <iostream>
 #include <sstream>
 
+#include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
 #include "ck/host_utility/io.hpp"
 
 namespace ck {
@@ -147,6 +154,153 @@ __global__ void
 #endif
 }
 
+template <typename GridwiseGemm,
+          typename AGridDesc_AK0_M_K1,
+          typename BGridDesc_BK0_N_K1,
+          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename ComputePtrOffsetOfBatch,
+          typename ComputePtrOffsetOfN,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_grouped_conv_bwd_data_xdl_cshuffle_v3(
+        typename GridwiseGemm::Argument karg,
+        const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+        const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+        const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
+        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+        const ComputePtrOffsetOfN compute_ptr_offset_of_n,
+        const index_t num_k_per_block)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+    // offset base pointer for each work-group
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch);
+    const index_t k_idx =
+        __builtin_amdgcn_readfirstlane((blockIdx.y - n_idx * karg.KBatch) * num_k_per_block);
+
+    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+
+    const long_index_t a_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+    const long_index_t e_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
+                               BGridDesc_BK0_N_K1,
+                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                               HasMainKBlockLoop,
+                               CGlobalMemoryDataOperation,
+                               TailNum>(karg.p_a_grid + a_batch_offset + a_n_offset,
+                                        karg.p_b_grid + b_batch_offset,
+                                        karg.p_c_grid + e_batch_offset + e_n_offset,
+                                        p_shared,
+                                        karg,
+                                        a_grid_desc_ak0_m_ak1,
+                                        b_grid_desc_bk0_n_bk1,
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        k_idx);
+#else
+    ignore = karg;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = compute_ptr_offset_of_batch;
+    ignore = compute_ptr_offset_of_n;
+    ignore = num_k_per_block;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
+template <typename GridwiseGemm,
+          typename AGridDesc_AK0_M_K1,
+          typename BGridDesc_BK0_N_K1,
+          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename ComputePtrOffsetOfBatch,
+          typename ComputePtrOffsetOfN,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds(
+        typename GridwiseGemm::Argument karg,
+        const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+        const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+        const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
+        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+        const ComputePtrOffsetOfN compute_ptr_offset_of_n,
+        const index_t num_k_per_block)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch);
+    const index_t k_idx =
+        __builtin_amdgcn_readfirstlane((blockIdx.y - n_idx * karg.KBatch) * num_k_per_block);
+
+    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+
+    const long_index_t a_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+    const long_index_t e_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
+                                    BGridDesc_BK0_N_K1,
+                                    CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    HasMainKBlockLoop,
+                                    CGlobalMemoryDataOperation,
+                                    TailNum>(karg.p_a_grid + a_batch_offset + a_n_offset,
+                                             karg.p_b_grid + b_batch_offset,
+                                             karg.p_c_grid + e_batch_offset + e_n_offset,
+                                             p_shared_0,
+                                             p_shared_1,
+                                             karg,
+                                             a_grid_desc_ak0_m_ak1,
+                                             b_grid_desc_bk0_n_bk1,
+                                             c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                             k_idx);
+#else
+    ignore = karg;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = compute_ptr_offset_of_batch;
+    ignore = compute_ptr_offset_of_n;
+    ignore = num_k_per_block;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
 } // namespace
 
 // Conv backward data multiple D:
@@ -202,9 +356,13 @@ template <index_t NDimSpatial,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler(),
-          typename AComputeType   = ADataType,
-          typename BComputeType   = AComputeType>
+          LoopScheduler LoopSched                        = make_default_loop_scheduler(),
+          typename AComputeType                          = ADataType,
+          typename BComputeType                          = AComputeType,
+          index_t MaxTransposeTransferInScalarPerVector  = 1,
+          index_t MaxTransposeTransferOutScalarPerVector = 1,
+          BlockGemmPipelineScheduler BlkGemmPipeSched    = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer    = BlockGemmPipelineVersion::v1>
 struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     : public DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
                                                ALayout,    // output image
@@ -227,7 +385,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
     using DeviceOp = DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1;
 
-    static constexpr index_t NumDTensor = DsDataType::Size();
+    static constexpr index_t NumDTensor          = DsDataType::Size();
+    static constexpr bool isMultiD               = NumDTensor > 0;
+    static constexpr GemmSpecialization GemmSpec = GemmSpecialization::MNKPadding;
+    static constexpr bool IsSplitKSupported =
+        (CDEBlockTransferScalarPerVector_NPerBlock % 2 == 0 || sizeof(EDataType) % 4 == 0) &&
+        std::is_same_v<remove_cvref_t<CDEElementwiseOp>, element_wise::PassThrough>;
 
     // TODO: Add support for different A and B data types.
     using ABDataType = ADataType;
@@ -237,6 +400,25 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
 
+    using ALayoutAfterTranspose =
+        std::conditional_t<is_NGCHW_NGKHW<ELayout, BLayout, ALayout>(),
+                           tensor_layout::convolution::NHWGK,
+                           std::conditional_t<is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>(),
+                                              tensor_layout::convolution::NDHWGK,
+                                              ALayout>>;
+    using BLayoutAfterTranspose =
+        std::conditional_t<is_NGCHW_GKCYX_NGKHW<ELayout, BLayout, ALayout>(),
+                           tensor_layout::convolution::GKYXC,
+                           std::conditional_t<is_NGCDHW_GKCZYX_NGKDHW<ELayout, BLayout, ALayout>(),
+                                              tensor_layout::convolution::GKZYXC,
+                                              BLayout>>;
+    using ELayoutAfterTranspose =
+        std::conditional_t<is_NGCHW_NGKHW<ELayout, BLayout, ALayout>(),
+                           tensor_layout::convolution::NHWGC,
+                           std::conditional_t<is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>(),
+                                              tensor_layout::convolution::NDHWGC,
+                                              ELayout>>;
+
     using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
                                                                      ConvBackwardDataSpecialization,
                                                                      AK1,
@@ -246,9 +428,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                                                                      KPerBlock,
                                                                      DoPadGemmM,
                                                                      DoPadGemmN,
-                                                                     ALayout,
-                                                                     BLayout,
-                                                                     ELayout,
+                                                                     ALayoutAfterTranspose,
+                                                                     BLayoutAfterTranspose,
+                                                                     ELayoutAfterTranspose,
                                                                      true, /*SplitConvN*/
                                                                      ABDataType,
                                                                      EDataType>;
@@ -274,7 +456,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                                                   KPerBlock,
                                                   DoPadGemmM,
                                                   DoPadGemmN,
-                                                  ALayout,
+                                                  ALayoutAfterTranspose,
                                                   BLayout,
                                                   DLayout,
                                                   true, /*SplitConvN*/
@@ -290,53 +472,63 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
     }
 
-    // GridwiseGemm
-    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
-        ABDataType,
-        ABDataType,
-        AComputeType,
-        AccDataType,
-        CShuffleDataType,
-        DsDataType,
-        EDataType,
-        AElementwiseOp,
-        BElementwiseOp,
-        CDEElementwiseOp,
-        InMemoryDataOperationEnum::Set,
-        NumGemmKPrefetchStage,
-        BlockSize,
-        MPerBlock,
-        NPerBlock,
-        KPerBlock,
-        AK1,
-        BK1,
-        MPerXDL,
-        NPerXDL,
-        MXdlPerWave,
-        NXdlPerWave,
-        ABlockTransferThreadClusterLengths_AK0_M_AK1,
-        ABlockTransferThreadClusterArrangeOrder,
-        ABlockTransferSrcAccessOrder,
-        ABlockTransferSrcVectorDim,
-        ABlockTransferSrcScalarPerVector,
-        ABlockTransferDstScalarPerVector_AK1,
-        false,
-        ABlockLdsExtraM,
-        BBlockTransferThreadClusterLengths_BK0_N_BK1,
-        BBlockTransferThreadClusterArrangeOrder,
-        BBlockTransferSrcAccessOrder,
-        BBlockTransferSrcVectorDim,
-        BBlockTransferSrcScalarPerVector,
-        BBlockTransferDstScalarPerVector_BK1,
-        false,
-        BBlockLdsExtraN,
-        CShuffleMXdlPerWavePerShuffle,
-        CShuffleNXdlPerWavePerShuffle,
-        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CDEBlockTransferScalarPerVector_NPerBlock,
-        LoopSched,
-        PipelineVersion::v1,
-        BComputeType>;
+// GridwiseGemm
+#define GridwiseGemmMultiDTemplateParams                                                        \
+    ABDataType, ABDataType, AComputeType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
+        AElementwiseOp, BElementwiseOp, CDEElementwiseOp, InMemoryDataOperationEnum::Set,       \
+        NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL,   \
+        NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1,        \
+        ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder,                  \
+        ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector,                           \
+        ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM,                           \
+        BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder,  \
+        BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim,                               \
+        BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false,          \
+        BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle,          \
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,                       \
+        CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, BComputeType
+
+#define GridwiseGemmTemplateParams                                                               \
+    tensor_layout::gemm::RowMajor, tensor_layout::gemm::RowMajor, tensor_layout::gemm::RowMajor, \
+        ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementwiseOp,          \
+        BElementwiseOp, CDEElementwiseOp, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock,  \
+        AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave,                                    \
+        ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder,   \
+        ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim,                                \
+        ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false,           \
+        ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1,                           \
+        BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder,                   \
+        BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector,                            \
+        BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN,                            \
+        CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle,                            \
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,                        \
+        CDEBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer,         \
+        AComputeType, BComputeType
+
+    using GridwiseGemm =
+        std::conditional_t<isMultiD,
+                           GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmMultiDTemplateParams>,
+                           GridwiseGemm_xdl_cshuffle_v3<GridwiseGemmTemplateParams>>;
+
+    template <typename EGridDesc_M_N>
+    static auto
+    MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N e_grid_desc_m_n)
+    {
+        if constexpr(isMultiD)
+        {
+            return GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                e_grid_desc_m_n);
+        }
+        else
+        {
+            const index_t M = e_grid_desc_m_n.GetLength(I0);
+            const index_t N = e_grid_desc_m_n.GetLength(I1);
+            return GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                e_grid_desc_m_n,
+                GridwiseGemm::CalculateMBlock(M),
+                GridwiseGemm::CalculateNBlock(N));
+        }
+    }
 
     template <typename Desc_K0_M_K1>
     static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1& desc_k0_m_k1)
@@ -365,16 +557,104 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     using BGridDesc_N_K = decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{}));
 
     using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-            DsGridDesc_M_N{}));
+        decltype(GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmMultiDTemplateParams>::
+                     MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}));
     using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-            EGridDesc_M_N{}));
+        decltype(MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}));
 
     // block-to-e-tile map
-    using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+    using Block2ETileMap = remove_cvref_t<
+        decltype(GridwiseGemmMultipleD_xdl_cshuffle<
+                 GridwiseGemmMultiDTemplateParams>::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+    using Block2TileMapInOutElementwise = BlockToCTileMap_M00_N0_M01Adapt<NPerBlock, MPerBlock>;
+    using Block2TileMapWeiElementwise   = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
 
+    static constexpr index_t ClusterLengthMPerBlock =
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1);
+    static constexpr index_t ClusterLengthNPerBlock =
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3);
+
+    static constexpr auto conv_ngchw_to_nhwgc_transformer =
+        TransformConvNGCHWToNHWGC<ELayout,
+                                  BLayout,
+                                  ALayout,
+                                  NDimSpatial,
+                                  NPerBlock / ClusterLengthNPerBlock,
+                                  MPerBlock / ClusterLengthMPerBlock>{};
+
+    static constexpr index_t TransposeTransferInScalarPerVectorAligned =
+        std::min(MPerBlock / ClusterLengthMPerBlock, MaxTransposeTransferInScalarPerVector);
+    static constexpr index_t TransposeTransferOutScalarPerVectorAligned =
+        std::min(MPerBlock / ClusterLengthMPerBlock, MaxTransposeTransferOutScalarPerVector);
+
+    using NGCHWTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeNGCHWTransposeDesc<NDimSpatial>({}, {}))>;
+    using NHWGCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKCYXTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKCYXTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKYXCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKYXCTransposeDesc<NDimSpatial>({}, {}))>;
+
+    static constexpr index_t ElementwiseBlocksize = ClusterLengthMPerBlock * ClusterLengthNPerBlock;
+
+    using GridwiseElementwiseInputTranspose =
+        GridwiseElementwise<Tuple<NGCHWTransposeDescType>,
+                            Tuple<NHWGCTransposeDescType>,
+                            Tuple<const ADataType*>,
+                            Tuple<ADataType*>,
+                            Block2TileMapInOutElementwise,
+                            element_wise::PassThrough,
+                            ElementwiseBlocksize,
+                            NPerBlock,
+                            MPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<TransposeTransferInScalarPerVectorAligned>,
+                            Sequence<CDEBlockTransferScalarPerVector_NPerBlock>,
+                            I1,
+                            I0>;
+
+    using GridwiseElementwiseWeightTranspose =
+        GridwiseElementwise<Tuple<GKCYXTransposeDescType>,
+                            Tuple<GKYXCTransposeDescType>,
+                            Tuple<const BDataType*>,
+                            Tuple<BDataType*>,
+                            Block2TileMapWeiElementwise,
+                            element_wise::PassThrough,
+                            ElementwiseBlocksize,
+                            MPerBlock,
+                            NPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<1>,
+                            Sequence<CDEBlockTransferScalarPerVector_NPerBlock>,
+                            I0,
+                            I1>;
+
+    using GridwiseElementwiseOutputTranspose =
+        GridwiseElementwise<Tuple<NHWGCTransposeDescType>,
+                            Tuple<NGCHWTransposeDescType>,
+                            Tuple<const EDataType*>,
+                            Tuple<EDataType*>,
+                            Block2TileMapInOutElementwise,
+                            element_wise::PassThrough,
+                            ElementwiseBlocksize,
+                            NPerBlock,
+                            MPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<CDEBlockTransferScalarPerVector_NPerBlock>,
+                            Sequence<TransposeTransferOutScalarPerVectorAligned>,
+                            I0,
+                            I1>;
     // Argument
     struct Argument : public BaseArgument
     {
@@ -398,7 +678,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                  const std::array<index_t, NDimSpatial>& input_right_pads,
                  const AElementwiseOp& a_element_op,
                  const BElementwiseOp& b_element_op,
-                 const CDEElementwiseOp& cde_element_op)
+                 const CDEElementwiseOp& cde_element_op,
+                 ck::index_t split_k = 1)
             : p_a_grid_{static_cast<const ADataType*>(p_a)},
               p_b_grid_{static_cast<const BDataType*>(p_b)},
               p_ds_grid_{},
@@ -409,10 +690,22 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
               cde_element_op_{cde_element_op},
               a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
               b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
+              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
               conv_filter_strides_{conv_filter_strides},
               input_left_pads_{input_left_pads},
-              input_right_pads_{input_right_pads}
+              input_right_pads_{input_right_pads},
+              k_batch_{split_k}
         {
+            std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths,
+                                                                      a_g_n_k_wos_strides);
+            std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(b_g_k_c_xs_lengths,
+                                                                    b_g_k_c_xs_strides);
+            std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(e_g_n_c_wis_lengths,
+                                                                      e_g_n_c_wis_strides);
+
             // populate Ds pointer
             static_for<0, NumDTensor, 1>{}([&](auto i) {
                 using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
@@ -491,17 +784,19 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                             throw std::runtime_error("wrong! only implemented for 2D and 3D now");
                         }
 
-                        ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
-                                                                           a_g_n_k_wos_strides,
-                                                                           b_g_k_c_xs_lengths,
-                                                                           b_g_k_c_xs_strides,
-                                                                           e_g_n_c_wis_lengths,
-                                                                           e_g_n_c_wis_strides,
-                                                                           conv_filter_strides,
-                                                                           conv_filter_dilations,
-                                                                           input_left_pads,
-                                                                           input_right_pads,
-                                                                           tildes};
+                        ConvToGemmBwdDataTransform conv_to_gemm_transform_{
+                            a_g_n_k_wos_lengths,
+                            a_g_n_k_wos_strides_transposed,
+                            b_g_k_c_xs_lengths,
+                            b_g_k_c_xs_strides_transposed,
+                            e_g_n_c_wis_lengths,
+                            e_g_n_c_wis_strides_transposed,
+                            conv_filter_strides,
+                            conv_filter_dilations,
+                            input_left_pads,
+                            input_right_pads,
+                            tildes,
+                            k_batch_};
 
                         conv_N_per_block_ = conv_to_gemm_transform_.N_;
 
@@ -527,17 +822,17 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                                                               KPerBlock,
                                                               DoPadGemmM,
                                                               DoPadGemmN,
-                                                              ALayout,
-                                                              BLayout,
+                                                              ALayoutAfterTranspose,
+                                                              BLayoutAfterTranspose,
                                                               DLayout,
                                                               true, /*SplitConvN*/
                                                               ABDataType,
                                                               DDataType>;
                             ConvToGemmBwdDataTransformD conv_to_gemm_transform_d{
                                 a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
+                                a_g_n_k_wos_strides_transposed,
                                 b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
+                                b_g_k_c_xs_strides_transposed,
                                 ds_g_n_c_wis_lengths[i],
                                 ds_g_n_c_wis_strides[i],
                                 conv_filter_strides,
@@ -557,46 +852,155 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                         const auto b_grid_desc_n_k =
                             transform_k0_m_k1_to_m_k(b_grid_desc_bk0_n_bk1);
 
-                        a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k);
-                        b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k);
-                        ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
-                        e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n);
+                        if constexpr(isMultiD)
+                        {
+                            a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k);
+                            b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k);
+                            ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
+                            e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n);
+                        }
 
                         // desc for blockwise copy
                         a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1);
                         b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1);
 
-                        // block-to-e-tile-map
-                        auto block_2_etile_map =
-                            GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
-
-                        block_2_etile_map_container_.push_back(block_2_etile_map);
-
-                        if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
-                                                       b_grid_desc_n_k,
-                                                       ds_grid_desc_m_n,
-                                                       e_grid_desc_m_n,
-                                                       block_2_etile_map))
+                        if constexpr(isMultiD)
                         {
-                            ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
-                                GridwiseGemm::
-                                    MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                                        ds_grid_desc_m_n));
+                            // block-to-e-tile-map
+                            auto block_2_etile_map =
+                                GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);
 
+                            block_2_etile_map_container_.push_back(block_2_etile_map);
+
+                            if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
+                                                           b_grid_desc_n_k,
+                                                           ds_grid_desc_m_n,
+                                                           e_grid_desc_m_n,
+                                                           block_2_etile_map))
+                            {
+                                ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
+
+                                    GridwiseGemm::
+                                        MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                            ds_grid_desc_m_n));
+
+                                e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
+                                    MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                        e_grid_desc_m_n));
+                            }
+                        }
+                        else
+                        {
+                            // there is no need to check since M, N, K are padded
                             e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
-                                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                                     e_grid_desc_m_n));
                         }
                     }
                 }
             }
             // A/B/Ds/E Batch Stride
-            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides_transposed[0];
+            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides_transposed[0];
 
-            compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides[1] * conv_N_per_block_;
-            compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_c_wis_strides[1] * conv_N_per_block_;
+            compute_ptr_offset_of_n_.BatchStrideA_ =
+                a_g_n_k_wos_strides_transposed[1] * conv_N_per_block_;
+            compute_ptr_offset_of_n_.BatchStrideE_ =
+                e_g_n_c_wis_strides_transposed[1] * conv_N_per_block_;
+
+            num_workgroups_per_Conv_N_ = a_g_n_k_wos_lengths_[I1] / conv_N_per_block_;
+
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                // Use not modified base strides
+                a_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
+                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides, num_workgroups_per_Conv_N_);
+                a_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
+                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides, num_workgroups_per_Conv_N_);
+
+                b_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc<NDimSpatial>(
+                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides);
+                b_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc<NDimSpatial>(
+                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides);
+
+                e_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
+                        e_g_n_c_wis_lengths, e_g_n_c_wis_strides, num_workgroups_per_Conv_N_);
+                e_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
+                        e_g_n_c_wis_lengths, e_g_n_c_wis_strides, num_workgroups_per_Conv_N_);
+
+                elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapInOutElementwise{
+                    a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)};
+                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapWeiElementwise{
+                    b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
+                elementwise_block_2_ctile_map_transpose_e_ = Block2TileMapInOutElementwise{
+                    e_in_transpose_desc_.GetLength(I0), e_in_transpose_desc_.GetLength(I1)};
+
+                compute_ptr_offset_of_workspace_n_.BatchStrideA_ =
+                    a_g_n_k_wos_strides[1] * conv_N_per_block_;
+                compute_ptr_offset_of_workspace_n_.BatchStrideE_ =
+                    e_g_n_c_wis_strides[1] * conv_N_per_block_;
+            }
+        }
+
+        std::size_t GetWorkspaceATensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                const long_index_t a_acum = ck::accumulate_n<long_index_t>(
+                    a_g_n_k_wos_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                // Align to 128B
+                return math::integer_divide_ceil(sizeof(ADataType) * a_acum, 128) * 128;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceBTensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                const long_index_t b_acum = ck::accumulate_n<long_index_t>(
+                    b_g_k_c_xs_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                // Align to 128B
+                return math::integer_divide_ceil(sizeof(BDataType) * b_acum, 128) * 128;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceETensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                const long_index_t e_accum = ck::accumulate_n<long_index_t>(
+                    e_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                return sizeof(EDataType) * e_accum;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceSizeBytes() const
+        {
+            return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() +
+                   GetWorkspaceETensorSizeBytes();
         }
 
         void Print() const
@@ -624,7 +1028,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         // pointers
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
-        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        typename GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmMultiDTemplateParams>::DsGridPointer
+            p_ds_grid_;
         EDataType* p_e_grid_;
 
         // tensor descriptor for problem definition
@@ -645,10 +1050,19 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         // block-to-e-tile map
         std::vector<Block2ETileMap> block_2_etile_map_container_;
+        Block2TileMapInOutElementwise elementwise_block_2_ctile_map_transpose_a_,
+            elementwise_block_2_ctile_map_transpose_e_;
+        Block2TileMapWeiElementwise elementwise_block_2_ctile_map_transpose_b_;
+
+        NGCHWTransposeDescType a_in_transpose_desc_, e_out_transpose_desc_;
+        NHWGCTransposeDescType a_out_transpose_desc_, e_in_transpose_desc_;
+        GKCYXTransposeDescType b_in_transpose_desc_;
+        GKYXCTransposeDescType b_out_transpose_desc_;
 
         // for computing batch offset
         ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor> compute_ptr_offset_of_batch_;
         ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_n_;
+        ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_workspace_n_;
 
         // element-wise op
         AElementwiseOp a_element_op_;
@@ -657,9 +1071,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
         std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
+        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
         std::array<index_t, NDimSpatial> conv_filter_strides_;
         std::array<index_t, NDimSpatial> input_left_pads_;
         std::array<index_t, NDimSpatial> input_right_pads_;
+
+        const index_t k_batch_;
+        index_t num_workgroups_per_Conv_N_;
     };
 
     // Invoker
@@ -667,19 +1085,34 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float RunMultiDGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            if(stream_config.log_level_ > 0)
-            {
-                arg.Print();
-            }
+            float ave_time = 0;
 
             const index_t gdy = arg.num_group_;
-            const index_t num_workgroups_per_Conv_N =
-                arg.a_g_n_k_wos_lengths_[I1] / arg.conv_N_per_block_;
-            const index_t gdz = num_workgroups_per_Conv_N;
+            const index_t gdz = arg.num_workgroups_per_Conv_N_;
+
+            const ADataType* p_a_grid = arg.p_a_grid_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
+            EDataType* p_e_grid       = arg.p_e_grid_;
+
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                p_e_grid =
+                    type_convert<EDataType*>(arg.p_workspace_) +
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
+            }
+
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                           arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+            }
 
-            float ave_time = 0;
             for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
             {
                 if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
@@ -722,10 +1155,10 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                         dim3(gdx, gdy, gdz),
                         dim3(BlockSize),
                         0,
-                        arg.p_a_grid_,
-                        arg.p_b_grid_,
+                        p_a_grid,
+                        p_b_grid,
                         arg.p_ds_grid_,
-                        arg.p_e_grid_,
+                        p_e_grid,
                         arg.a_element_op_,
                         arg.b_element_op_,
                         arg.cde_element_op_,
@@ -751,6 +1184,823 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             return ave_time;
         }
 
+        float RunGemmV3(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            float ave_time = 0;
+
+            const ADataType* p_a_grid = arg.p_a_grid_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
+            EDataType* p_e_grid       = arg.p_e_grid_;
+
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                p_e_grid =
+                    type_convert<EDataType*>(arg.p_workspace_) +
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
+            }
+
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                           arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+            }
+
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+
+            for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
+            {
+                const index_t GemmM = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I1);
+                const index_t GemmN = arg.b_grid_desc_bk0_n_bk1_container_[i].GetLength(I1);
+                const index_t GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) *
+                                      arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2);
+
+                const auto num_k_per_block =
+                    arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(Number<0>{}) / arg.k_batch_;
+
+                // gdy is for the kbatch and num_workgrups_per_Conv_N
+                index_t gdx, gdy, gdz;
+                std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(
+                    GemmM, GemmN, arg.k_batch_ * arg.num_workgroups_per_Conv_N_, arg.num_group_);
+
+                index_t k_grain = arg.k_batch_ * KPerBlock;
+                index_t K_split = (GemmK + k_grain - 1) / k_grain * KPerBlock;
+                const bool has_main_k_block_loop =
+                    GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+                typename GridwiseGemm::Argument gemm_arg{
+                    p_a_grid, p_b_grid, p_e_grid, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+                const auto Run = [&](const auto& kernel) {
+                    if(stream_config.flush_cache)
+                    {
+                        typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
+                        ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument>
+                            rotating_mem(gemm_arg_,
+                                         stream_config.rotating_count,
+                                         gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
+                                         gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
+                        rotating_mem.Print();
+
+                        auto run_flush_cache = [&]() {
+                            // flush icache
+                            ck::utility::flush_icache();
+                            // rotating mem
+                            rotating_mem.Next();
+                        };
+
+                        ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                            stream_config,
+                            run_flush_cache,
+                            kernel,
+                            dim3(gdx, gdy, gdz),
+                            dim3(BlockSize),
+                            0,
+                            gemm_arg_,
+                            arg.a_grid_desc_ak0_m_ak1_container_[i],
+                            arg.b_grid_desc_bk0_n_bk1_container_[i],
+                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
+                            arg.compute_ptr_offset_of_batch_,
+                            arg.compute_ptr_offset_of_n_,
+                            num_k_per_block);
+                    }
+                    else
+                    {
+                        ave_time += launch_and_time_kernel(
+                            stream_config,
+                            kernel,
+                            dim3(gdx, gdy, gdz),
+                            dim3(BlockSize),
+                            0,
+                            gemm_arg,
+                            arg.a_grid_desc_ak0_m_ak1_container_[i],
+                            arg.b_grid_desc_bk0_n_bk1_container_[i],
+                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
+                            arg.compute_ptr_offset_of_batch_,
+                            arg.compute_ptr_offset_of_n_,
+                            num_k_per_block);
+                    }
+                };
+
+                if(has_main_k_block_loop)
+                {
+                    // Tail number always full
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                                 BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                    {
+                        if(gemm_arg.KBatch > 1)
+                        {
+                            if constexpr(IsSplitKSupported)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    DeviceOp::AGridDesc_AK0_M_AK1,
+                                    DeviceOp::BGridDesc_BK0_N_BK1,
+                                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy>;
+                                Run(kernel);
+                            }
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy>;
+                            Run(kernel);
+                        }
+                    }
+                    // Tail number could be One to Seven
+                    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                    {
+                        if(gemm_arg.KBatch > 1)
+                        {
+                            if constexpr(IsSplitKSupported)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::One)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::AtomicAdd,
+                                            minimum_occupancy,
+                                            TailNumber::One>;
+                                    Run(kernel);
+                                }
+                                else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                        TailNumber::Full)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::AtomicAdd,
+                                            minimum_occupancy,
+                                            TailNumber::Full>;
+                                    Run(kernel);
+                                }
+
+                                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                                {
+                                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                       TailNumber::Two)
+                                    {
+                                        const auto kernel =
+                                            kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                                GridwiseGemm,
+                                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                                DeviceOp::
+                                                    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                true,
+                                                InMemoryDataOperationEnum::AtomicAdd,
+                                                minimum_occupancy,
+                                                TailNumber::Two>;
+                                        Run(kernel);
+                                    }
+                                }
+
+                                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                                {
+                                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                       TailNumber::Three)
+                                    {
+                                        const auto kernel =
+                                            kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                                GridwiseGemm,
+                                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                                DeviceOp::
+                                                    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                true,
+                                                InMemoryDataOperationEnum::AtomicAdd,
+                                                minimum_occupancy,
+                                                TailNumber::Three>;
+                                        Run(kernel);
+                                    }
+                                }
+
+                                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                                {
+                                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                       TailNumber::Four)
+                                    {
+                                        const auto kernel =
+                                            kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                                GridwiseGemm,
+                                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                                DeviceOp::
+                                                    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                true,
+                                                InMemoryDataOperationEnum::AtomicAdd,
+                                                minimum_occupancy,
+                                                TailNumber::Four>;
+                                        Run(kernel);
+                                    }
+                                }
+
+                                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                                {
+                                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                       TailNumber::Five)
+                                    {
+                                        const auto kernel =
+                                            kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                                GridwiseGemm,
+                                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                                DeviceOp::
+                                                    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                true,
+                                                InMemoryDataOperationEnum::AtomicAdd,
+                                                minimum_occupancy,
+                                                TailNumber::Five>;
+                                        Run(kernel);
+                                    }
+                                }
+
+                                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                                {
+                                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                       TailNumber::Six)
+                                    {
+                                        const auto kernel =
+                                            kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                                GridwiseGemm,
+                                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                                DeviceOp::
+                                                    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                true,
+                                                InMemoryDataOperationEnum::AtomicAdd,
+                                                minimum_occupancy,
+                                                TailNumber::Six>;
+                                        Run(kernel);
+                                    }
+                                }
+
+                                if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                                {
+                                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                       TailNumber::Seven)
+                                    {
+                                        const auto kernel =
+                                            kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                                GridwiseGemm,
+                                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                                DeviceOp::
+                                                    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                                true,
+                                                InMemoryDataOperationEnum::AtomicAdd,
+                                                minimum_occupancy,
+                                                TailNumber::Seven>;
+                                        Run(kernel);
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    DeviceOp::AGridDesc_AK0_M_AK1,
+                                    DeviceOp::BGridDesc_BK0_N_BK1,
+                                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::One>;
+                                Run(kernel);
+                            }
+                            else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                    TailNumber::Full)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    DeviceOp::AGridDesc_AK0_M_AK1,
+                                    DeviceOp::BGridDesc_BK0_N_BK1,
+                                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Full>;
+                                Run(kernel);
+                            }
+
+                            if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Two)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::Set,
+                                            minimum_occupancy,
+                                            TailNumber::Two>;
+                                    Run(kernel);
+                                }
+                            }
+
+                            if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Three)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::Set,
+                                            minimum_occupancy,
+                                            TailNumber::Three>;
+                                    Run(kernel);
+                                }
+                            }
+
+                            if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Four)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::Set,
+                                            minimum_occupancy,
+                                            TailNumber::Four>;
+                                    Run(kernel);
+                                }
+                            }
+
+                            if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Five)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::Set,
+                                            minimum_occupancy,
+                                            TailNumber::Five>;
+                                    Run(kernel);
+                                }
+                            }
+
+                            if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Six)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::Set,
+                                            minimum_occupancy,
+                                            TailNumber::Six>;
+                                    Run(kernel);
+                                }
+                            }
+
+                            if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Seven)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::Set,
+                                            minimum_occupancy,
+                                            TailNumber::Seven>;
+                                    Run(kernel);
+                                }
+                            }
+                        }
+                    }
+                    // Tail number could be Odd or Even
+                    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                    {
+                        if(gemm_arg.KBatch > 1)
+                        {
+                            if constexpr(IsSplitKSupported)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Odd)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::AtomicAdd,
+                                            minimum_occupancy,
+                                            TailNumber::Odd>;
+                                    Run(kernel);
+                                }
+                                else
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::AtomicAdd,
+                                            minimum_occupancy,
+                                            TailNumber::Even>;
+                                    Run(kernel);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                            {
+                                const auto kernel =
+                                    kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds<
+                                        GridwiseGemm,
+                                        DeviceOp::AGridDesc_AK0_M_AK1,
+                                        DeviceOp::BGridDesc_BK0_N_BK1,
+                                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                        ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                        ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                        true,
+                                        InMemoryDataOperationEnum::Set,
+                                        minimum_occupancy,
+                                        TailNumber::Odd>;
+                                Run(kernel);
+                            }
+                            else
+                            {
+                                const auto kernel =
+                                    kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds<
+                                        GridwiseGemm,
+                                        DeviceOp::AGridDesc_AK0_M_AK1,
+                                        DeviceOp::BGridDesc_BK0_N_BK1,
+                                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                        ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                        ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                        true,
+                                        InMemoryDataOperationEnum::Set,
+                                        minimum_occupancy,
+                                        TailNumber::Even>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(gemm_arg.KBatch > 1)
+                        {
+                            if constexpr(IsSplitKSupported)
+                            {
+                                if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                   TailNumber::Odd)
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::AtomicAdd,
+                                            minimum_occupancy,
+                                            TailNumber::Odd>;
+                                    Run(kernel);
+                                }
+                                else
+                                {
+                                    const auto kernel =
+                                        kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                            GridwiseGemm,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                            true,
+                                            InMemoryDataOperationEnum::AtomicAdd,
+                                            minimum_occupancy,
+                                            TailNumber::Even>;
+                                    Run(kernel);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    DeviceOp::AGridDesc_AK0_M_AK1,
+                                    DeviceOp::BGridDesc_BK0_N_BK1,
+                                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Odd>;
+                                Run(kernel);
+                            }
+                            else
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    DeviceOp::AGridDesc_AK0_M_AK1,
+                                    DeviceOp::BGridDesc_BK0_N_BK1,
+                                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Even>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    // Tail number always 1
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                    {
+                        if(gemm_arg.KBatch > 1)
+                        {
+                            if constexpr(IsSplitKSupported)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    DeviceOp::AGridDesc_AK0_M_AK1,
+                                    DeviceOp::BGridDesc_BK0_N_BK1,
+                                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    false,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy>;
+                                Run(kernel);
+                            }
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                DeviceOp::AGridDesc_AK0_M_AK1,
+                                DeviceOp::BGridDesc_BK0_N_BK1,
+                                DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                false,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            return ave_time;
+        }
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            float ave_time = 0;
+
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+            // Transpose from NGKHW to NHWGK
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                EDataType* p_e_in_grid =
+                    type_convert<EDataType*>(arg.p_workspace_) +
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
+
+                const auto clear_workspace = [&]() {
+                    hip_check_error(hipMemsetAsync(p_e_in_grid,
+                                                   0,
+                                                   arg.GetWorkspaceETensorSizeBytes(),
+                                                   stream_config.stream_id_));
+                };
+
+                const index_t a_grid_size =
+                    arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
+                        arg.a_in_transpose_desc_) *
+                    arg.num_workgroups_per_Conv_N_;
+                const index_t b_grid_size =
+                    (is_NGCHW_GKCYX_NGKHW<ELayout, BLayout, ALayout>() ||
+                     is_NGCDHW_GKCZYX_NGKDHW<ELayout, BLayout, ALayout>())
+                        ? arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize(
+                              arg.b_in_transpose_desc_)
+                        : 0; // Dont run transpose B if not needed
+
+                ADataType* p_a_out_grid = type_convert<ADataType*>(arg.p_workspace_);
+                BDataType* p_b_out_grid = type_convert<BDataType*>(arg.p_workspace_) +
+                                          arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+
+                auto kernel_transpose =
+                    kernel_elementwise_batched_dual<GridwiseElementwiseInputTranspose,
+                                                    GridwiseElementwiseWeightTranspose,
+                                                    ck::Tuple<NGCHWTransposeDescType>,
+                                                    ck::Tuple<GKCYXTransposeDescType>,
+                                                    ck::Tuple<NHWGCTransposeDescType>,
+                                                    ck::Tuple<GKYXCTransposeDescType>,
+                                                    ck::Tuple<const ADataType*>,
+                                                    ck::Tuple<const BDataType*>,
+                                                    ck::Tuple<ADataType*>,
+                                                    ck::Tuple<BDataType*>,
+                                                    Block2TileMapInOutElementwise,
+                                                    Block2TileMapWeiElementwise,
+                                                    element_wise::PassThrough,
+                                                    I1,
+                                                    I1,
+                                                    I1,
+                                                    I1>;
+
+                ave_time += launch_and_time_kernel_with_preprocess(
+                    stream_config,
+                    clear_workspace,
+                    kernel_transpose,
+                    dim3(a_grid_size + b_grid_size),
+                    dim3(ElementwiseBlocksize),
+                    0,
+                    make_tuple(arg.a_in_transpose_desc_),
+                    make_tuple(arg.b_in_transpose_desc_),
+                    make_tuple(arg.a_out_transpose_desc_),
+                    make_tuple(arg.b_out_transpose_desc_),
+                    make_tuple(arg.p_a_grid_),
+                    make_tuple(arg.p_b_grid_),
+                    make_tuple(p_a_out_grid),
+                    make_tuple(p_b_out_grid),
+                    arg.elementwise_block_2_ctile_map_transpose_a_,
+                    arg.elementwise_block_2_ctile_map_transpose_b_,
+                    element_wise::PassThrough{},
+                    a_grid_size,
+                    arg.num_workgroups_per_Conv_N_,
+                    I1, // B is not splited per N
+                    std::array<index_t, I1>{
+                        static_cast<index_t>(arg.compute_ptr_offset_of_workspace_n_.BatchStrideA_)},
+                    std::array<index_t, I1>{0},
+                    std::array<index_t, I1>{
+                        static_cast<index_t>(arg.compute_ptr_offset_of_n_.BatchStrideA_)},
+                    std::array<index_t, I1>{0});
+            }
+
+            if constexpr(isMultiD)
+            {
+                ave_time += RunMultiDGemm(arg, stream_config);
+            }
+            else
+            {
+                ave_time += RunGemmV3(arg, stream_config);
+            }
+
+            // Transpose from NHWGC to NGCHW
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                         is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+            {
+                const index_t grid_size =
+                    arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize(
+                        arg.e_in_transpose_desc_) *
+                    arg.num_workgroups_per_Conv_N_;
+
+                const EDataType* p_e_in_grid =
+                    type_convert<EDataType*>(arg.p_workspace_) +
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
+
+                EDataType* p_e_out_grid = arg.p_e_grid_;
+
+                auto kernel_transpose =
+                    kernel_batched_elementwise<GridwiseElementwiseOutputTranspose,
+                                               ck::Tuple<NHWGCTransposeDescType>,
+                                               ck::Tuple<NGCHWTransposeDescType>,
+                                               ck::Tuple<const EDataType*>,
+                                               ck::Tuple<EDataType*>,
+                                               Block2TileMapInOutElementwise,
+                                               element_wise::PassThrough,
+                                               I1,
+                                               I1>;
+
+                ave_time += launch_and_time_kernel(
+                    stream_config,
+                    kernel_transpose,
+                    dim3(grid_size),
+                    dim3(ElementwiseBlocksize),
+                    0,
+                    make_tuple(arg.e_in_transpose_desc_),
+                    make_tuple(arg.e_out_transpose_desc_),
+                    make_tuple(p_e_in_grid),
+                    make_tuple(p_e_out_grid),
+                    arg.elementwise_block_2_ctile_map_transpose_e_,
+                    element_wise::PassThrough{},
+                    arg.num_workgroups_per_Conv_N_,
+                    std::array<index_t, I1>{
+                        static_cast<index_t>(arg.compute_ptr_offset_of_n_.BatchStrideE_)},
+                    std::array<index_t, I1>{static_cast<index_t>(
+                        arg.compute_ptr_offset_of_workspace_n_.BatchStrideE_)});
+            }
+
+            return ave_time;
+        }
+
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
@@ -765,9 +2015,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             return false;
         }
 
+        if(!is_bf16_atomic_supported() && std::is_same_v<EDataType, ck::bhalf_t> &&
+           arg.k_batch_ > 1)
+        {
+            return false;
+        }
+
+        if constexpr(!IsSplitKSupported)
+        {
+            if(arg.k_batch_ != 1)
+            {
+                return false;
+            }
+        }
+
+        const index_t ConvG = arg.b_g_k_c_xs_lengths_[0];
         const index_t ConvK = arg.b_g_k_c_xs_lengths_[1];
         const index_t ConvC = arg.b_g_k_c_xs_lengths_[2];
 
+        if constexpr(!isMultiD)
+        {
+            for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
+            {
+                const index_t GemmM = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I1);
+                const index_t GemmN = arg.b_grid_desc_bk0_n_bk1_container_[i].GetLength(I1);
+                const index_t GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) *
+                                      arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2);
+
+                typename GridwiseGemm::Argument gemm_arg{
+                    nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+                const auto num_k_loop = gemm_arg.AK0 / (KPerBlock / AK1);
+                if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+                {
+                    if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages)
+                    {
+                        return false;
+                    }
+                }
+            }
+        }
+
         // Specifialization
         if constexpr(ConvBackwardDataSpecialization ==
                      ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0)
@@ -787,7 +2075,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
                      is_same_v<ALayout, tensor_layout::convolution::GNDHWK> ||
                      is_same_v<ALayout, tensor_layout::convolution::NHWGK> ||
-                     is_same_v<ALayout, tensor_layout::convolution::NDHWGK>)
+                     is_same_v<ALayout, tensor_layout::convolution::NDHWGK> ||
+                     is_same_v<ALayout, tensor_layout::convolution::NGKHW> ||
+                     is_same_v<ALayout, tensor_layout::convolution::NGKDHW>)
         {
             if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0))
             {
@@ -801,7 +2091,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
 
         // vector load for B matrix from global memory to LDS
         if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
-                     is_same_v<BLayout, tensor_layout::convolution::GKZYXC>)
+                     is_same_v<BLayout, tensor_layout::convolution::GKZYXC> ||
+                     is_same_v<BLayout, tensor_layout::convolution::GKCYX> ||
+                     is_same_v<BLayout, tensor_layout::convolution::GKCZYX>)
         {
             if(!(BBlockTransferSrcVectorDim == 1 && ConvC % BBlockTransferSrcScalarPerVector == 0))
             {
@@ -848,7 +2140,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC> ||
                      is_same_v<ELayout, tensor_layout::convolution::GNDHWC> ||
                      is_same_v<ELayout, tensor_layout::convolution::NHWGC> ||
-                     is_same_v<ELayout, tensor_layout::convolution::NDHWGC>)
+                     is_same_v<ELayout, tensor_layout::convolution::NDHWGC> ||
+                     is_same_v<ELayout, tensor_layout::convolution::NGCHW> ||
+                     is_same_v<ELayout, tensor_layout::convolution::NGCDHW>)
         {
             // vector store C matrix into global memory
             if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
@@ -864,12 +2158,57 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         // Gridwise GEMM size
         for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
-                                            arg.b_grid_desc_n_k_container_[i],
-                                            arg.ds_grid_desc_m_n_container_[i],
-                                            arg.e_grid_desc_m_n_container_[i],
-                                            arg.block_2_etile_map_container_[i]))
+            if constexpr(isMultiD)
             {
+                if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
+                                                arg.b_grid_desc_n_k_container_[i],
+                                                arg.ds_grid_desc_m_n_container_[i],
+                                                arg.e_grid_desc_m_n_container_[i],
+                                                arg.block_2_etile_map_container_[i]))
+                {
+                    return false;
+                }
+            }
+        }
+
+        if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                     is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>())
+        {
+            if((ConvG * ConvC) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+
+            if((ConvG * ConvK) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }
+
+            const index_t a_spatial_acum = ck::accumulate_n<index_t>(
+                arg.a_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
+            const index_t e_spatial_acum = ck::accumulate_n<index_t>(
+                arg.e_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
+
+            if(a_spatial_acum % TransposeTransferInScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            if(e_spatial_acum % TransposeTransferOutScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            if(!arg.p_workspace_)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout
+                        << "Warning: Workspace for "
+                           "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::Argument is not "
+                           "allocated, use SetWorkSpacePointer."
+                        << std::endl;
+                }
                 return false;
             }
         }
@@ -903,7 +2242,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                  const std::array<index_t, NDimSpatial>& input_right_pads,
                  const AElementwiseOp& a_element_op,
                  const BElementwiseOp& b_element_op,
-                 const CDEElementwiseOp& cde_element_op)
+                 const CDEElementwiseOp& cde_element_op,
+                 const ck::index_t split_k = 1)
     {
         return Argument{p_a,
                         p_b,
@@ -923,7 +2263,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                         input_right_pads,
                         a_element_op,
                         b_element_op,
-                        cde_element_op};
+                        cde_element_op,
+                        split_k};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -949,7 +2290,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
         const std::array<index_t, NDimSpatial>& input_right_pads,
         const AElementwiseOp& a_element_op,
         const BElementwiseOp& b_element_op,
-        const CDEElementwiseOp& cde_element_op) override
+        const CDEElementwiseOp& cde_element_op,
+        const ck::index_t split_k = 1) override
     {
         return std::make_unique<Argument>(p_a,
                                           p_b,
@@ -969,7 +2311,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                                           input_right_pads,
                                           a_element_op,
                                           b_element_op,
-                                          cde_element_op);
+                                          cde_element_op,
+                                          split_k);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
@@ -981,6 +2324,17 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
     {
         auto str = std::stringstream();
 
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
         // clang-format off
         str << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1"
             << "<"
@@ -998,11 +2352,52 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
             << ABlockTransferSrcScalarPerVector << ", "
             << BBlockTransferSrcScalarPerVector << ", "
             << CShuffleMXdlPerWavePerShuffle << ", "
-            << CShuffleNXdlPerWavePerShuffle
-            << ">";
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer];
+
+            if constexpr(is_NGCHW_NGKHW<ELayout, BLayout, ALayout>() ||
+                        is_NGCDHW_NGKDHW<ELayout, BLayout, ALayout>()) {
+                    str << ", TransposeTransferInScalarPerVectorAligned: "
+                    << TransposeTransferInScalarPerVectorAligned <<", "
+                    << "TransposeTransferOutScalarPerVectorAligned: " << TransposeTransferOutScalarPerVectorAligned;
+                }
+    
+                
+            str << ">";
 
         return str.str();
     }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        auto arg = dynamic_cast<const Argument*>(p_arg);
+        if(arg)
+        {
+            return arg->GetWorkspaceSizeBytes();
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::Argument structure!");
+    }
+
+    void SetWorkSpacePointer(BaseArgument* p_arg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
+    {
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->p_workspace_ = p_workspace;
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::Argument structure!");
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
index 795995d9a3..c7d95254c5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
@@ -8,7 +8,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
-
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -19,7 +19,7 @@
 #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp"
 #include <ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -218,8 +218,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
     using EDataType = WeiDataType;
 
     // If NGCHW then ADataType must be equal to BDataType
-    static_assert(!(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                    is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) ||
+    static_assert(!(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                    is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>()) ||
                   is_same_v<ADataType, BDataType>);
 
     using AElementwiseOperation   = OutElementwiseOperation;
@@ -376,6 +376,12 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
     using NHWGCTransposeDescType =
         remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
                                     .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKCYXTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKCYXTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKYXCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKYXCTransposeDesc<NDimSpatial>({}, {}))>;
 
     using ABCGridDescs = decltype(GetABCGridDesc<NDimSpatial>());
 
@@ -452,6 +458,28 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                             Sequence<CBlockTransferScalarPerVector_NWaveNPerXdl>,
                             I1,
                             I1>;
+    // NPerBlock is used for the first dim which is store dimension
+    // (with CBlockTransferScalarPerVector_NWaveNPerXdl scalar per vector).
+    // CBlockTransferScalarPerVector_NWaveNPerXdl is aligned to NPerBlock so
+    // it is more flexible to use this dim for store dimension with such scalar
+    // per vector.
+    using GridwiseElementwiseWeightTransposeCast =
+        GridwiseElementwise<Tuple<GKYXCTransposeDescType>,
+                            Tuple<GKCYXTransposeDescType>,
+                            Tuple<const AccDataType*>,
+                            Tuple<EDataType*>,
+                            Block2TileMapElementwise,
+                            CDEElementwiseOperation,
+                            BlockSize,
+                            MPerBlock,
+                            NPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<0, 1>,
+                            Sequence<CBlockTransferScalarPerVector_NWaveNPerXdl>,
+                            Sequence<1>,
+                            I1,
+                            I0>;
 
     using GridwiseElementwiseTranspose =
         GridwiseElementwise<Tuple<NGCHWTransposeDescType>,
@@ -533,12 +561,15 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                       end(a_g_n_k_wos_lengths),
                       begin(output_spatial_lengths_));
 
-            std::array<index_t, NDimSpatial + 3> b_g_n_c_wis_strides_transposed =
-                conv_ngchw_to_nhwgc_transformer.TransposeStrides(b_g_n_c_wis_lengths,
-                                                                 b_g_n_c_wis_strides);
             std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_transposed =
-                conv_ngchw_to_nhwgc_transformer.TransposeStrides(a_g_n_k_wos_lengths,
-                                                                 a_g_n_k_wos_strides);
+                conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths,
+                                                                      a_g_n_k_wos_strides);
+            std::array<index_t, NDimSpatial + 3> b_g_n_c_wis_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(b_g_n_c_wis_lengths,
+                                                                      b_g_n_c_wis_strides);
+            std::array<index_t, NDimSpatial + 3> e_g_k_c_xs_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(e_g_k_c_xs_lengths,
+                                                                    e_g_k_c_xs_strides);
 
             const auto descs =
                 conv_to_gemm_transformer_v2
@@ -550,7 +581,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                         filter_spatial_lengths_,
                         output_spatial_lengths_,
                         b_g_n_c_wis_strides_transposed,
-                        e_g_k_c_xs_strides,
+                        e_g_k_c_xs_strides_transposed,
                         a_g_n_k_wos_strides_transposed,
                         conv_filter_strides,
                         conv_filter_dilations,
@@ -580,29 +611,21 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                         input_right_pads,
                         k_batch_)[I2];
 
-            elementwise_block_2_ctile_map_ = Block2TileMapElementwise{
-                ce_grid_desc_m_n_.GetLength(I0), ce_grid_desc_m_n_.GetLength(I1)};
-
             const index_t GemmM = a_grid_desc_k0_m_k1_.GetLength(I1);
             const index_t GemmN = b_grid_desc_k0_n_k1_.GetLength(I1);
 
             // A/B/C Batch Stride
             compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0];
             compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0];
-            compute_ptr_offset_of_batch_.BatchStrideC_ =
-                Conv_K_ * Conv_C_ *
-                std::accumulate(begin(filter_spatial_lengths_),
-                                end(filter_spatial_lengths_),
-                                index_t{1},
-                                std::multiplies<>{});
+            compute_ptr_offset_of_batch_.BatchStrideC_ = e_g_k_c_xs_strides_transposed[0];
             c_grid_desc_mblock_mperblock_nblock_nperblock_ =
                 GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                     ce_grid_desc_m_n_,
                     GridwiseGemm::CalculateMBlock(GemmM),
                     GridwiseGemm::CalculateNBlock(GemmN));
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
                 a_in_transpose_desc_ =
                     conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
@@ -618,17 +641,35 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                     conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
                         b_g_n_c_wis_lengths, b_g_n_c_wis_strides);
 
+                e_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc<NDimSpatial>(
+                        e_g_k_c_xs_lengths, e_g_k_c_xs_strides);
+                e_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc<NDimSpatial>(
+                        e_g_k_c_xs_lengths, e_g_k_c_xs_strides);
+
                 elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{
                     a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)};
 
                 elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{
                     b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
             }
+
+            elementwise_block_2_ctile_map_ =
+                is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                        is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>()
+                    ? Block2TileMapElementwise{e_in_transpose_desc_.GetLength(I0),
+                                               e_in_transpose_desc_.GetLength(I1)}
+                    : Block2TileMapElementwise{ce_grid_desc_m_n_.GetLength(I0),
+                                               ce_grid_desc_m_n_.GetLength(I1)};
         }
 
         std::size_t GetWorkspaceATensorSizeBytes() const
         {
-            return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize();
+            // Align to 128B
+            return math::integer_divide_ceil(
+                       sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(), 128) *
+                   128;
         }
 
         std::size_t GetWorkspaceBTensorSizeBytes() const
@@ -638,14 +679,23 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
 
         std::size_t GetWorkspaceETensorSizeBytes() const
         {
-            return sizeof(AccDataType) * ce_grid_desc_m_n_.GetElementSpaceSize() * Conv_G_;
+            // Align to 128B
+            return math::integer_divide_ceil(sizeof(AccDataType) *
+                                                 ce_grid_desc_m_n_.GetElementSpaceSize() * Conv_G_,
+                                             128) *
+                   128;
         }
 
         std::size_t GetWorkspaceSizeBytes() const
         {
-            // Transpose require workspace for A and B
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            // 1. We need to transpose A and B for NGCHW and NGKHW layouts
+            // 2. If C format is GKCYX then tranpose during second stage.
+            //    If C format is GKYXC then just perform second stage.
+            //    Due to the fact that E workspace is always needed, we
+            //    allocate them as the first part of the workspace.
+            //    [EWorkspace, AWorkspace, BWorkspace]
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
                 return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() +
                        GetWorkspaceETensorSizeBytes();
@@ -672,6 +722,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
 
         NGCHWTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_;
         NHWGCTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_;
+        GKYXCTransposeDescType e_in_transpose_desc_;
+        GKCYXTransposeDescType e_out_transpose_desc_;
 
         // for computing batch offset
         ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_batch_;
@@ -728,11 +780,11 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             const ADataType* p_a_grid = arg.p_a_grid_;
             const BDataType* p_b_grid = arg.p_b_grid_;
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
                 p_a_grid = type_convert<const ADataType*>(arg.p_workspace_) +
-                           arg.GetWorkspaceETensorSizeBytes() / sizeof(BDataType);
+                           arg.GetWorkspaceETensorSizeBytes() / sizeof(ADataType);
                 p_b_grid =
                     type_convert<const BDataType*>(arg.p_workspace_) +
                     (arg.GetWorkspaceETensorSizeBytes() + arg.GetWorkspaceATensorSizeBytes()) /
@@ -1373,41 +1425,72 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             float avg_time                 = 0.f;
             auto launch_elementwise_kernel = [&]() {
                 const AccDataType* p_c_grid = type_convert<const AccDataType*>(arg.p_workspace_);
-                const index_t grid_size     = arg.elementwise_block_2_ctile_map_.CalculateGridSize(
-                                              arg.ce_elementwise_grid_desc_m_n_) *
-                                          arg.Conv_G_;
 
                 std::array<index_t, I1> in_out_batch_strides = {
                     static_cast<index_t>(arg.compute_ptr_offset_of_batch_.BatchStrideC_)};
 
-                const auto kernel = kernel_batched_elementwise<GridwiseElementwiseCast,
-                                                               ck::Tuple<CElementwiseGridDesc_M_N>,
-                                                               ck::Tuple<CElementwiseGridDesc_M_N>,
-                                                               ck::Tuple<const AccDataType*>,
-                                                               ck::Tuple<EDataType*>,
-                                                               Block2TileMapElementwise,
-                                                               CDEElementwiseOperation,
-                                                               I1,
-                                                               I1>;
+                if constexpr(is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                             is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>())
+                {
+                    const index_t grid_size = arg.elementwise_block_2_ctile_map_.CalculateGridSize(
+                        arg.e_in_transpose_desc_);
 
-                return launch_and_time_kernel(stream_config,
-                                              kernel,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              make_tuple(arg.ce_elementwise_grid_desc_m_n_),
-                                              make_tuple(arg.ce_elementwise_grid_desc_m_n_),
-                                              make_tuple(p_c_grid),
-                                              make_tuple(arg.p_e_grid_),
-                                              arg.elementwise_block_2_ctile_map_,
-                                              arg.cde_element_op_,
-                                              arg.Conv_G_,
-                                              in_out_batch_strides,
-                                              in_out_batch_strides);
+                    const auto kernel = kernel_elementwise<GridwiseElementwiseWeightTransposeCast,
+                                                           ck::Tuple<GKYXCTransposeDescType>,
+                                                           ck::Tuple<GKCYXTransposeDescType>,
+                                                           ck::Tuple<const AccDataType*>,
+                                                           ck::Tuple<EDataType*>,
+                                                           Block2TileMapElementwise,
+                                                           CDEElementwiseOperation>;
+
+                    return launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  make_tuple(arg.e_in_transpose_desc_),
+                                                  make_tuple(arg.e_out_transpose_desc_),
+                                                  make_tuple(p_c_grid),
+                                                  make_tuple(arg.p_e_grid_),
+                                                  arg.elementwise_block_2_ctile_map_,
+                                                  arg.cde_element_op_);
+                }
+                else
+                {
+                    const index_t grid_size = arg.elementwise_block_2_ctile_map_.CalculateGridSize(
+                                                  arg.ce_elementwise_grid_desc_m_n_) *
+                                              arg.Conv_G_;
+
+                    const auto kernel =
+                        kernel_batched_elementwise<GridwiseElementwiseCast,
+                                                   ck::Tuple<CElementwiseGridDesc_M_N>,
+                                                   ck::Tuple<CElementwiseGridDesc_M_N>,
+                                                   ck::Tuple<const AccDataType*>,
+                                                   ck::Tuple<EDataType*>,
+                                                   Block2TileMapElementwise,
+                                                   CDEElementwiseOperation,
+                                                   I1,
+                                                   I1>;
+
+                    return launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  make_tuple(arg.ce_elementwise_grid_desc_m_n_),
+                                                  make_tuple(arg.ce_elementwise_grid_desc_m_n_),
+                                                  make_tuple(p_c_grid),
+                                                  make_tuple(arg.p_e_grid_),
+                                                  arg.elementwise_block_2_ctile_map_,
+                                                  arg.cde_element_op_,
+                                                  arg.Conv_G_,
+                                                  in_out_batch_strides,
+                                                  in_out_batch_strides);
+                }
             };
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
                 const index_t grid_size_a =
                     arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
@@ -1417,7 +1500,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
                         arg.b_in_transpose_desc_);
 
                 ADataType* p_a_out_grid = type_convert<ADataType*>(arg.p_workspace_) +
-                                          arg.GetWorkspaceETensorSizeBytes() / sizeof(BDataType);
+                                          arg.GetWorkspaceETensorSizeBytes() / sizeof(ADataType);
                 BDataType* p_b_out_grid =
                     type_convert<BDataType*>(arg.p_workspace_) +
                     (arg.GetWorkspaceETensorSizeBytes() + arg.GetWorkspaceATensorSizeBytes()) /
@@ -1425,11 +1508,14 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
 
                 // Different data type for A and B is not supported
                 auto kernel_transpose = kernel_elementwise_dual<GridwiseElementwiseTranspose,
+                                                                GridwiseElementwiseTranspose,
                                                                 ck::Tuple<NGCHWTransposeDescType>,
                                                                 ck::Tuple<NGCHWTransposeDescType>,
                                                                 ck::Tuple<NHWGCTransposeDescType>,
                                                                 ck::Tuple<NHWGCTransposeDescType>,
                                                                 ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<ADataType*>,
                                                                 ck::Tuple<ADataType*>,
                                                                 Block2TileMapElementwise,
                                                                 Block2TileMapElementwise,
@@ -1511,7 +1597,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
         if constexpr(NDimSpatial == 2)
         {
             if constexpr(!(is_NHWGC_GKYXC_NHWGK<InLayout, WeiLayout, OutLayout>() ||
-                           is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>()))
+                           is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>()))
             {
                 return false;
             }
@@ -1519,7 +1605,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
         else if constexpr(NDimSpatial == 3)
         {
             if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK<InLayout, WeiLayout, OutLayout>() ||
-                           is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()))
+                           is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>()))
             {
                 return false;
             }
@@ -1594,8 +1680,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             return false;
         }
 
-        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                     is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+        if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                     is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
         {
             if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVector != 0)
             {
@@ -1621,6 +1707,13 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             {
                 return false;
             }
+
+            constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+            if(!(arg.a_out_transpose_desc_.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+                 arg.b_out_transpose_desc_.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB))
+            {
+                return false;
+            }
         }
 
         return true;
@@ -1757,8 +1850,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << NumGroupsToMerge;
             
-        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() || 
-                        is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) {
+        if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() || 
+                        is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>()) {
                 str << ", TransposeTransferSrcScalarPerVector: "
                 << TransposeTransferSrcScalarPerVector <<", "
                 << "TransposeTransferDstScalarPerVector: " << TransposeTransferDstScalarPerVector;
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index abd6a080aa..c904b4e7d5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -8,6 +8,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -165,8 +166,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     using CDataType = WeiDataType;
 
     // If NGCHW then ADataType must be equal to BDataType
-    static_assert(!(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                    is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) ||
+    static_assert(!(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                    is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>()) ||
                   is_same_v<ADataType, BDataType>);
 
     using AElementwiseOperation = OutElementwiseOperation;
@@ -301,7 +302,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                   MPerBlock / ClusterLengthMPerBlock,
                                   NPerBlock / ClusterLengthNPerBlock>{};
 
-    using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
+    using Block2TileMapTranspose = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
 
     static constexpr index_t TransposeTransferSrcScalarPerVectorAligned =
         std::min(NPerBlock / ClusterLengthNPerBlock, MaxTransposeTransferSrcScalarPerVector);
@@ -314,13 +315,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     using NHWGCTransposeDescType =
         remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
                                     .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKCYXTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKCYXTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKYXCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKYXCTransposeDesc<NDimSpatial>({}, {}))>;
 
-    using GridwiseElementwiseTranspose =
+    using GridwiseInOutTranspose =
         GridwiseElementwise<Tuple<NGCHWTransposeDescType>,
                             Tuple<NHWGCTransposeDescType>,
                             Tuple<const ADataType*>,
                             Tuple<ADataType*>,
-                            Block2TileMapElementwise,
+                            Block2TileMapTranspose,
                             element_wise::PassThrough,
                             BlockSize,
                             MPerBlock,
@@ -333,6 +340,26 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                             I1,
                             I0>;
 
+    // NPerBlock is used for the first dim which is store dimension
+    // (with CBlockTransferScalarPerVector_NWaveNPerXdl scalar per vector).
+    using GridwiseElementwiseWeightTranspose =
+        GridwiseElementwise<Tuple<GKYXCTransposeDescType>,
+                            Tuple<GKCYXTransposeDescType>,
+                            Tuple<const CDataType*>,
+                            Tuple<CDataType*>,
+                            Block2TileMapTranspose,
+                            element_wise::PassThrough,
+                            BlockSize,
+                            MPerBlock,
+                            NPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<CBlockTransferScalarPerVector_NWaveNPerXdl>,
+                            Sequence<1>,
+                            I1,
+                            I0>;
+
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
         BlockSize,
         ADataType,
@@ -452,13 +479,15 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                       end(a_g_n_k_wos_lengths),
                       begin(output_spatial_lengths_));
 
-            std::array<index_t, NDimSpatial + 3> b_g_n_c_wis_strides_transposed =
-                conv_ngchw_to_nhwgc_transformer.TransposeStrides(b_g_n_c_wis_lengths,
-                                                                 b_g_n_c_wis_strides);
             std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_transposed =
-                conv_ngchw_to_nhwgc_transformer.TransposeStrides(a_g_n_k_wos_lengths,
-                                                                 a_g_n_k_wos_strides);
-
+                conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths,
+                                                                      a_g_n_k_wos_strides);
+            std::array<index_t, NDimSpatial + 3> b_g_n_c_wis_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(b_g_n_c_wis_lengths,
+                                                                      b_g_n_c_wis_strides);
+            std::array<index_t, NDimSpatial + 3> e_g_k_c_xs_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(e_g_k_c_xs_lengths,
+                                                                    e_g_k_c_xs_strides);
             const auto descs =
                 conv_to_gemm_transformer
                     .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
@@ -469,7 +498,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                         filter_spatial_lengths_,
                         output_spatial_lengths_,
                         b_g_n_c_wis_strides_transposed,
-                        e_g_k_c_xs_strides,
+                        e_g_k_c_xs_strides_transposed,
                         a_g_n_k_wos_strides_transposed,
                         conv_filter_strides,
                         conv_filter_dilations,
@@ -487,12 +516,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             // A/B/C Batch Stride
             compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0];
             compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0];
-            compute_ptr_offset_of_batch_.BatchStrideC_ =
-                Conv_K_ * Conv_C_ *
-                std::accumulate(begin(filter_spatial_lengths_),
-                                end(filter_spatial_lengths_),
-                                index_t{1},
-                                std::multiplies<>{});
+            compute_ptr_offset_of_batch_.BatchStrideC_ = e_g_k_c_xs_strides_transposed[0];
 
             if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
                                            b_grid_desc_kbatch_k0_n_k1_,
@@ -503,8 +527,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
             }
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
                 a_in_transpose_desc_ =
                     conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
@@ -520,31 +544,33 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
                         b_g_n_c_wis_lengths, b_g_n_c_wis_strides);
 
-                elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{
+                e_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc<NDimSpatial>(
+                        e_g_k_c_xs_lengths, e_g_k_c_xs_strides);
+                e_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc<NDimSpatial>(
+                        e_g_k_c_xs_lengths, e_g_k_c_xs_strides);
+
+                elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapTranspose{
                     a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)};
 
-                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{
+                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapTranspose{
                     b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
+
+                elementwise_block_2_ctile_map_transpose_e_ = Block2TileMapTranspose{
+                    e_in_transpose_desc_.GetLength(I0), e_in_transpose_desc_.GetLength(I1)};
             }
         }
 
         std::size_t GetWorkspaceATensorSizeBytes() const
         {
-            return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize();
-        }
-
-        std::size_t GetWorkspaceBTensorSizeBytes() const
-        {
-            return sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize();
-        }
-
-        std::size_t GetWorkspaceSizeBytes() const
-        {
-            // Transpose require workspace for A and B
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
-                return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes();
+                // Align to 128B
+                return math::integer_divide_ceil(
+                           sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(), 128) *
+                       128;
             }
             else
             {
@@ -552,6 +578,41 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             }
         }
 
+        std::size_t GetWorkspaceBTensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                // Align to 128B
+                return math::integer_divide_ceil(
+                           sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize(), 128) *
+                       128;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceETensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                return sizeof(CDataType) * e_in_transpose_desc_.GetElementSpaceSize();
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceSizeBytes() const
+        {
+            return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() +
+                   GetWorkspaceETensorSizeBytes();
+        }
+
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
         CDataType* p_c_grid_;
@@ -562,12 +623,15 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
 
         Block2CTileMap block_2_ctile_map_;
 
-        Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_,
-            elementwise_block_2_ctile_map_transpose_b_;
+        Block2TileMapTranspose elementwise_block_2_ctile_map_transpose_a_,
+            elementwise_block_2_ctile_map_transpose_b_, elementwise_block_2_ctile_map_transpose_e_;
 
         NGCHWTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_;
         NHWGCTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_;
 
+        GKYXCTransposeDescType e_in_transpose_desc_;
+        GKCYXTransposeDescType e_out_transpose_desc_;
+
         // for computing batch offset
         ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_;
 
@@ -621,9 +685,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
 
             const ADataType* p_a_grid = arg.p_a_grid_;
             const BDataType* p_b_grid = arg.p_b_grid_;
+            CDataType* p_e_grid       = arg.p_c_grid_;
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            if constexpr(is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                p_e_grid =
+                    type_convert<CDataType*>(arg.p_workspace_) +
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(CDataType);
+            }
+
+            if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
                 const index_t grid_size_a =
                     arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
@@ -640,15 +714,18 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                           arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
 
                 // Different data type for A and B is not supported
-                auto kernel_transpose = kernel_elementwise_dual<GridwiseElementwiseTranspose,
+                auto kernel_transpose = kernel_elementwise_dual<GridwiseInOutTranspose,
+                                                                GridwiseInOutTranspose,
                                                                 ck::Tuple<NGCHWTransposeDescType>,
                                                                 ck::Tuple<NGCHWTransposeDescType>,
                                                                 ck::Tuple<NHWGCTransposeDescType>,
                                                                 ck::Tuple<NHWGCTransposeDescType>,
                                                                 ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<const ADataType*>,
                                                                 ck::Tuple<ADataType*>,
-                                                                Block2TileMapElementwise,
-                                                                Block2TileMapElementwise,
+                                                                ck::Tuple<ADataType*>,
+                                                                Block2TileMapTranspose,
+                                                                Block2TileMapTranspose,
                                                                 element_wise::PassThrough>;
 
                 avg_time += launch_and_time_kernel(stream_config,
@@ -695,24 +772,36 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     ComputePtrOffsetOfStridedBatch<>,
                     has_main_loop>;
 
-                avg_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           p_a_grid,
-                                           p_b_grid,
-                                           arg.p_c_grid_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.Conv_G_,
-                                           arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.block_2_ctile_map_,
-                                           arg.compute_ptr_offset_of_batch_);
+                const auto clear_workspace = [&]() {
+                    if constexpr(is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                                 is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>())
+                    {
+                        hip_check_error(hipMemsetAsync(p_e_grid,
+                                                       0,
+                                                       arg.GetWorkspaceETensorSizeBytes(),
+                                                       stream_config.stream_id_));
+                    }
+                };
+
+                avg_time += launch_and_time_kernel_with_preprocess(
+                    stream_config,
+                    clear_workspace,
+                    kernel,
+                    dim3(grid_size),
+                    dim3(BlockSize),
+                    0,
+                    p_a_grid,
+                    p_b_grid,
+                    p_e_grid,
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_,
+                    arg.Conv_G_,
+                    arg.a_grid_desc_kbatch_k0_m_k1_,
+                    arg.b_grid_desc_kbatch_k0_n_k1_,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                    arg.block_2_ctile_map_,
+                    arg.compute_ptr_offset_of_batch_);
             };
 
             if(has_main_k0_block_loop)
@@ -723,6 +812,38 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             {
                 launch_kernel(integral_constant<bool, false>{});
             }
+
+            if constexpr(is_NGCHW_GKCYX_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                const index_t grid_size_e =
+                    arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize(
+                        arg.e_in_transpose_desc_);
+
+                const CDataType* p_e_in_grid = static_cast<const CDataType*>(p_e_grid);
+
+                // Different data type for A and B is not supported
+                auto kernel_transpose = kernel_elementwise<GridwiseElementwiseWeightTranspose,
+                                                           ck::Tuple<GKYXCTransposeDescType>,
+                                                           ck::Tuple<GKCYXTransposeDescType>,
+                                                           ck::Tuple<const CDataType*>,
+                                                           ck::Tuple<CDataType*>,
+                                                           Block2TileMapTranspose,
+                                                           element_wise::PassThrough>;
+
+                avg_time += launch_and_time_kernel(stream_config,
+                                                   kernel_transpose,
+                                                   dim3(grid_size_e),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   make_tuple(arg.e_in_transpose_desc_),
+                                                   make_tuple(arg.e_out_transpose_desc_),
+                                                   make_tuple(p_e_in_grid),
+                                                   make_tuple(arg.p_c_grid_),
+                                                   arg.elementwise_block_2_ctile_map_transpose_e_,
+                                                   element_wise::PassThrough{});
+            }
+
             return avg_time;
         }
 
@@ -760,7 +881,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         {
             if constexpr(!(is_NHWGC_GKYXC_NHWGK<InLayout, WeiLayout, OutLayout>() ||
                            is_GNHWC_GKYXC_GNHWK<InLayout, WeiLayout, OutLayout>() ||
-                           is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>()))
+                           is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>()))
             {
                 return false;
             }
@@ -769,7 +890,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         {
             if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK<InLayout, WeiLayout, OutLayout>() ||
                            is_GNDHWC_GKZYXC_GNDHWK<InLayout, WeiLayout, OutLayout>() ||
-                           is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()))
+                           is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>()))
             {
                 return false;
             }
@@ -807,8 +928,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             return false;
         }
 
-        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
-                     is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+        if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                     is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>())
         {
             if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVectorAligned != 0)
             {
@@ -834,6 +955,25 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             {
                 return false;
             }
+
+            if(!arg.p_workspace_)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Warning: Workspace for "
+                                 "DeviceGroupedConvBwdWeight_Xdl_CShuffle::Argument is not "
+                                 "allocated, use SetWorkSpacePointer."
+                              << std::endl;
+                }
+                return false;
+            }
+
+            constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+            if(!(arg.a_out_transpose_desc_.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+                 arg.b_out_transpose_desc_.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB))
+            {
+                return false;
+            }
         }
 
         // Gridwise GEMM size
@@ -958,8 +1098,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             << CShuffleNXdlPerWavePerShuffle << ", "
             << CBlockTransferScalarPerVector_NWaveNPerXdl;
 
-        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() || 
-                        is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) {
+        if constexpr(is_NGCHW_NGKHW<InLayout, WeiLayout, OutLayout>() || 
+                        is_NGCDHW_NGKDHW<InLayout, WeiLayout, OutLayout>()) {
                 str << ", TransposeTransferSrcScalarPerVectorAligned: "
                 << TransposeTransferSrcScalarPerVectorAligned <<", "
                 << "TransposeTransferDstScalarPerVectorAligned: " << TransposeTransferDstScalarPerVectorAligned;
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
index f6ea23a1e7..dd5b97096d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -17,7 +17,7 @@
 #include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp"
 #include <ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -81,6 +81,11 @@ __global__ void
                                         k_idx);
 #else
     ignore = karg;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = compute_ptr_offset_of_batch;
+    ignore = num_k_per_block;
 #endif // end of if (defined(__gfx9__)
 }
 
@@ -140,6 +145,11 @@ __global__ void
                                              k_idx);
 #else
     ignore = karg;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = compute_ptr_offset_of_batch;
+    ignore = num_k_per_block;
 #endif // end of if (defined(__gfx9__)
 }
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index 02ca8f42e4..c0148c3b9c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -11,6 +11,7 @@
 
 #include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -314,8 +315,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
 
     // NGCHW is not supported for multiAB
-    static_assert(!(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                    is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>()) ||
+    static_assert(!(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                    is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>()) ||
                   !(isMultiA || isMultiB));
 
     static constexpr index_t NumATensor = GetNumABTensors<isMultiA, ADataType>();
@@ -355,11 +356,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     {
         namespace ctc = tensor_layout::convolution;
         using Layout  = std::conditional_t<
-            is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>(),
             ctc::NHWGC,
-            std::conditional_t<is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>(),
-                               ctc::NDHWGC,
-                               ALay>>;
+            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>(), ctc::NDHWGC, ALay>>;
 
         const auto in_gemmmraw_gemmkraw_desc =
             conv_to_gemm_transformer.template MakeADescriptor_M_K<Layout>();
@@ -373,8 +372,14 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     template <typename BLay>
     static auto MakeBGridDescriptor_N_K(const ConvToGemmFwdTransformer& conv_to_gemm_transformer)
     {
+        namespace ctc = tensor_layout::convolution;
+        using Layout  = std::conditional_t<
+            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>(),
+            ctc::GKYXC,
+            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>(), ctc::GKZYXC, BLay>>;
+
         const auto wei_gemmnraw_gemmkraw_desc =
-            conv_to_gemm_transformer.template MakeBDescriptor_N_K<BLay>();
+            conv_to_gemm_transformer.template MakeBDescriptor_N_K<Layout>();
 
         const auto wei_gemmn_gemmk_desc =
             matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc);
@@ -387,11 +392,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     {
         namespace ctc = tensor_layout::convolution;
         using Layout  = std::conditional_t<
-            is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>(),
             ctc::NHWGK,
-            std::conditional_t<is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>(),
-                               ctc::NDHWGK,
-                               ELay>>;
+            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>(), ctc::NDHWGK, ELay>>;
 
         const auto out_gemmmraw_gemmnraw_desc =
             conv_to_gemm_transformer.template MakeCDescriptor_M_N<Layout>();
@@ -491,8 +494,20 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
                                     .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
 
+    using GKCYXTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKCYXTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKYXCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKYXCTransposeDesc<NDimSpatial>({}, {}))>;
+
     static constexpr index_t ElementwiseBlocksize = ClusterLengthNPerBlock * ClusterLengthNPerBlock;
 
+    // NPerBlock is used for the first and second dim which to use
+    // CDEBlockTransferScalarPerVector_NPerBlock for load and store during
+    // transposition. CBlockTransferScalarPerVector_NWaveNPerXdl is aligned to
+    // NPerBlock so it is more flexible to use this dim for load store dimension
+    // with such scalar per vector.
     using GridwiseElementwiseInputTranspose =
         GridwiseElementwise<Tuple<NGCHWTransposeDescType>,
                             Tuple<NHWGCTransposeDescType>,
@@ -511,6 +526,24 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                             I1,
                             I0>;
 
+    using GridwiseElementwiseWeightTranspose =
+        GridwiseElementwise<Tuple<GKCYXTransposeDescType>,
+                            Tuple<GKYXCTransposeDescType>,
+                            Tuple<const BDataType*>,
+                            Tuple<BDataType*>,
+                            Block2TileMapElementwise,
+                            element_wise::PassThrough,
+                            ElementwiseBlocksize,
+                            NPerBlock,
+                            NPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<1>,
+                            Sequence<CDEBlockTransferScalarPerVector_NPerBlock>,
+                            I0,
+                            I1>;
+
     using GridwiseElementwiseOutputTranspose =
         GridwiseElementwise<Tuple<NHWGCTransposeDescType>,
                             Tuple<NGCHWTransposeDescType>,
@@ -558,14 +591,15 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
               p_ds_grid_{},
               p_e_grid_{static_cast<EDataType*>(p_e)},
               a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths},
-              a_g_n_c_wis_strides_{conv_ngchw_to_nhwgc_transformer.TransposeStrides(
+              a_g_n_c_wis_strides_{conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
                   a_g_n_c_wis_lengths, a_g_n_c_wis_strides)},
               b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
+              b_g_k_c_xs_strides_{conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(
+                  b_g_k_c_xs_lengths, b_g_k_c_xs_strides)},
               ds_g_n_k_wos_lengths_{ds_g_n_k_wos_lengths},
               ds_g_n_k_wos_strides_{ds_g_n_k_wos_strides},
               e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths},
-              e_g_n_k_wos_strides_{conv_ngchw_to_nhwgc_transformer.TransposeStrides(
+              e_g_n_k_wos_strides_{conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
                   e_g_n_k_wos_lengths, e_g_n_k_wos_strides)},
               conv_filter_strides_{conv_filter_strides},
               conv_filter_dilations_{conv_filter_dilations},
@@ -744,8 +778,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 }
             }
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
             {
                 // Use not modified base strides
                 a_in_transpose_desc_ =
@@ -755,6 +789,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                     conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
                         a_g_n_c_wis_lengths, a_g_n_c_wis_strides);
 
+                b_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc<NDimSpatial>(
+                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides);
+                b_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc<NDimSpatial>(
+                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides);
+
                 e_in_transpose_desc_ =
                     conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
                         e_g_n_k_wos_lengths, e_g_n_k_wos_strides);
@@ -764,6 +805,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
 
                 elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{
                     a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)};
+                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{
+                    b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
                 elementwise_block_2_ctile_map_transpose_e_ = Block2TileMapElementwise{
                     e_in_transpose_desc_.GetLength(I0), e_in_transpose_desc_.GetLength(I1)};
             }
@@ -771,21 +814,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
 
         std::size_t GetWorkspaceATensorSizeBytes() const
         {
-            return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize();
-        }
-
-        std::size_t GetWorkspaceETensorSizeBytes() const
-        {
-            return sizeof(EDataType) * e_out_transpose_desc_.GetElementSpaceSize();
-        }
-
-        std::size_t GetWorkspaceSizeBytes() const
-        {
-            // Transpose require workspace for A and B
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
             {
-                return GetWorkspaceATensorSizeBytes() + GetWorkspaceETensorSizeBytes();
+                const long_index_t a_acum = ck::accumulate_n<long_index_t>(
+                    a_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                // Align to 128B
+                return math::integer_divide_ceil(sizeof(ADataType) * a_acum, 128) * 128;
             }
             else
             {
@@ -793,6 +828,43 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             }
         }
 
+        std::size_t GetWorkspaceBTensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+            {
+                const long_index_t b_acum = ck::accumulate_n<long_index_t>(
+                    b_g_k_c_xs_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                // Align to 128B
+                return math::integer_divide_ceil(sizeof(BDataType) * b_acum, 128) * 128;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceETensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            {
+                const long_index_t e_accum = ck::accumulate_n<long_index_t>(
+                    e_g_n_k_wos_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                return sizeof(EDataType) * e_accum;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceSizeBytes() const
+        {
+            return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() +
+                   GetWorkspaceETensorSizeBytes();
+        }
+
         void Print() const
         {
             std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl;
@@ -845,10 +917,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
         Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_,
-            elementwise_block_2_ctile_map_transpose_e_;
+            elementwise_block_2_ctile_map_transpose_b_, elementwise_block_2_ctile_map_transpose_e_;
 
         NGCHWTransposeDescType a_in_transpose_desc_, e_out_transpose_desc_;
         NHWGCTransposeDescType a_out_transpose_desc_, e_in_transpose_desc_;
+        GKCYXTransposeDescType b_in_transpose_desc_;
+        GKYXCTransposeDescType b_out_transpose_desc_;
 
         // for computing batch offset
         ComputePtrOffsetOfStridedBatch<NumATensor, NumBTensor, NumDTensor>
@@ -938,14 +1012,28 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 else
                 {
                     const ADataType* p_a_grid = arg.p_as_grid_.At(I0);
+                    const BDataType* p_b_grid = arg.p_bs_grid_.At(I0);
                     EDataType* p_e_grid       = arg.p_e_grid_;
 
-                    if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                                 is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+                    if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                                 is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+                    {
+                        p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                        p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                                   arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+                        p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
+                                   (arg.GetWorkspaceATensorSizeBytes() +
+                                    arg.GetWorkspaceBTensorSizeBytes()) /
+                                       sizeof(EDataType);
+                    }
+                    else if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
+                                      is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
                     {
                         p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
                         p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
-                                   arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType);
+                                   (arg.GetWorkspaceATensorSizeBytes() +
+                                    arg.GetWorkspaceBTensorSizeBytes()) /
+                                       sizeof(EDataType);
                     }
 
                     const auto kernel = kernel_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
@@ -974,8 +1062,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                         dim3(gdx, gdy, gdz),
                         dim3(BlockSize),
                         0,
-                        p_a_grid,              // Pass just A descriptor instead of tuple
-                        arg.p_bs_grid_.At(I0), // Pass just B descriptor instead of tuple
+                        p_a_grid,
+                        p_b_grid,
                         arg.p_ds_grid_,
                         p_e_grid,
                         arg.a_element_op_,
@@ -1005,50 +1093,71 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         {
             float avg_time = 0.f;
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
             {
-                const index_t grid_size =
+                const index_t a_grid_size =
                     arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
                         arg.a_in_transpose_desc_);
+                const index_t b_grid_size =
+                    (is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                     is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+                        ? arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize(
+                              arg.b_in_transpose_desc_)
+                        : 0; // Dont run transpose B if not needed
 
                 ADataType* p_a_out_grid = type_convert<ADataType*>(arg.p_workspace_);
+                BDataType* p_b_out_grid = type_convert<BDataType*>(arg.p_workspace_) +
+                                          arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
 
-                auto kernel_transpose = kernel_elementwise<GridwiseElementwiseInputTranspose,
-                                                           ck::Tuple<NGCHWTransposeDescType>,
-                                                           ck::Tuple<NHWGCTransposeDescType>,
-                                                           ck::Tuple<const ADataType*>,
-                                                           ck::Tuple<ADataType*>,
-                                                           Block2TileMapElementwise,
-                                                           element_wise::PassThrough>;
+                auto kernel_transpose = kernel_elementwise_dual<GridwiseElementwiseInputTranspose,
+                                                                GridwiseElementwiseWeightTranspose,
+                                                                ck::Tuple<NGCHWTransposeDescType>,
+                                                                ck::Tuple<GKCYXTransposeDescType>,
+                                                                ck::Tuple<NHWGCTransposeDescType>,
+                                                                ck::Tuple<GKYXCTransposeDescType>,
+                                                                ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<const BDataType*>,
+                                                                ck::Tuple<ADataType*>,
+                                                                ck::Tuple<BDataType*>,
+                                                                Block2TileMapElementwise,
+                                                                Block2TileMapElementwise,
+                                                                element_wise::PassThrough>;
 
                 avg_time += launch_and_time_kernel(stream_config,
                                                    kernel_transpose,
-                                                   dim3(grid_size),
+                                                   dim3(a_grid_size + b_grid_size),
                                                    dim3(ElementwiseBlocksize),
                                                    0,
                                                    make_tuple(arg.a_in_transpose_desc_),
+                                                   make_tuple(arg.b_in_transpose_desc_),
                                                    make_tuple(arg.a_out_transpose_desc_),
+                                                   make_tuple(arg.b_out_transpose_desc_),
                                                    make_tuple(arg.p_as_grid_.At(I0)),
+                                                   make_tuple(arg.p_bs_grid_.At(I0)),
                                                    make_tuple(p_a_out_grid),
+                                                   make_tuple(p_b_out_grid),
                                                    arg.elementwise_block_2_ctile_map_transpose_a_,
-                                                   element_wise::PassThrough{});
+                                                   arg.elementwise_block_2_ctile_map_transpose_b_,
+                                                   element_wise::PassThrough{},
+                                                   a_grid_size);
             }
 
             avg_time += RunGemm(arg, stream_config);
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
             {
                 const index_t grid_size =
                     arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize(
                         arg.e_in_transpose_desc_);
 
-                const EDataType* p_e_out_grid =
+                const EDataType* p_e_in_grid =
                     type_convert<EDataType*>(arg.p_workspace_) +
-                    arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType);
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
 
-                EDataType* p_e_in_grid = arg.p_e_grid_;
+                EDataType* p_e_out_grid = arg.p_e_grid_;
 
                 auto kernel_transpose = kernel_elementwise<GridwiseElementwiseOutputTranspose,
                                                            ck::Tuple<NHWGCTransposeDescType>,
@@ -1065,8 +1174,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                                                    0,
                                                    make_tuple(arg.e_in_transpose_desc_),
                                                    make_tuple(arg.e_out_transpose_desc_),
-                                                   make_tuple(p_e_out_grid),
                                                    make_tuple(p_e_in_grid),
+                                                   make_tuple(p_e_out_grid),
                                                    arg.elementwise_block_2_ctile_map_transpose_e_,
                                                    element_wise::PassThrough{});
             }
@@ -1110,12 +1219,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             // check if it's 1x1, stride=1 conv
             for(index_t i = 0; i < NDimSpatial; ++i)
             {
-                const index_t X          = arg.b_g_k_c_xs_lengths_[i + 3];
+                const index_t SpatialDim = arg.b_g_k_c_xs_lengths_[i + 3];
                 const index_t ConvStride = arg.conv_filter_strides_[i];
                 const index_t LeftPad    = arg.input_left_pads_[i];
                 const index_t RightPad   = arg.input_right_pads_[i];
 
-                if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
+                if(!(SpatialDim == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
                 {
                     return false;
                 }
@@ -1127,11 +1236,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             // check if it's 1x1 conv
             for(index_t i = 0; i < NDimSpatial; ++i)
             {
-                const index_t X        = arg.b_g_k_c_xs_lengths_[i + 3];
-                const index_t LeftPad  = arg.input_left_pads_[i];
-                const index_t RightPad = arg.input_right_pads_[i];
+                const index_t SpatialDim = arg.b_g_k_c_xs_lengths_[i + 3];
+                const index_t LeftPad    = arg.input_left_pads_[i];
+                const index_t RightPad   = arg.input_right_pads_[i];
 
-                if(!(X == 1 && LeftPad == 0 && RightPad == 0))
+                if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0))
                 {
                     return false;
                 }
@@ -1152,10 +1261,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                     return false;
                 }
             }
-            if constexpr(!is_NSpatialGC_GKSpatial_NSpatialGK<ALayout, BLayout, ELayout>())
-            {
-                return false;
-            }
         }
 
         if constexpr(NumGroupsToMerge > 1)
@@ -1169,7 +1274,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 return false;
             }
             if constexpr(!(is_NSpatialGC_GKSpatial_NSpatialGK<ALayout, BLayout, ELayout>() ||
-                           is_NGCSpatial_GKSpatial_NGKSpatial<ALayout, BLayout, ELayout>()))
+                           is_NGCSpatial_GKSpatial_NGKSpatial<ALayout, BLayout, ELayout>() ||
+                           is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                           is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>()))
             {
                 return false;
             }
@@ -1190,7 +1297,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 // If not possible, check access per G
                 if(!(ABlockTransferSrcVectorDim == 1 && (C == 1 || NumGroupsToMerge == 1) &&
                      (is_NSpatialGC_GKSpatial_NSpatialGK<ALayout, BLayout, ELayout>() ||
-                      is_NGCSpatial_GKSpatial_NGKSpatial<ALayout, BLayout, ELayout>()) &&
+                      is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                      is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>()) &&
                      G % ABlockTransferSrcScalarPerVector == 0))
                 {
                     return false;
@@ -1208,7 +1316,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                      is_same_v<BLayout, ctc::G_K_ZYX_C> || is_same_v<BLayout, ctc::GKXC> ||
                      is_same_v<BLayout, ctc::GKYXC> || is_same_v<BLayout, ctc::GKZYXC> ||
                      is_same_v<BLayout, ctc::KXGC> || is_same_v<BLayout, ctc::KYXGC> ||
-                     is_same_v<BLayout, ctc::KZYXGC>)
+                     is_same_v<BLayout, ctc::KZYXGC> || is_same_v<BLayout, ctc::GKCX> ||
+                     is_same_v<BLayout, ctc::GKCYX> || is_same_v<BLayout, ctc::GKCZYX>)
 
         {
             if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0))
@@ -1266,8 +1375,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             }
         });
 
-        if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                     is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+        if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                     is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
         {
             if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
             {
@@ -1293,6 +1402,25 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             {
                 return false;
             }
+
+            if(!arg.p_workspace_)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Warning: Workspace for "
+                                 "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle::Argument is not "
+                                 "allocated, use SetWorkSpacePointer."
+                              << std::endl;
+                }
+                return false;
+            }
+
+            constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+            if(!(arg.a_out_transpose_desc_.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+                 arg.e_in_transpose_desc_.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB))
+            {
+                return false;
+            }
         }
 
         if(!valid)
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
index 9363d7ecb9..a93e6ded96 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -11,6 +11,7 @@
 
 #include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -79,15 +80,12 @@ __global__ void
             [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                 c_grid_desc_mblock_mperblock_nblock_nperblock,
             [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups,
-            [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n,
-            [[maybe_unused]] const index_t groups_count)
+            [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
-    const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(gridDim.y / groups_count);
-    const index_t& num_blocks_per_n    = groups_count;
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch);
-    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
+    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
     const long_index_t a_batch_offset =
         amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
@@ -141,15 +139,12 @@ __global__ void
             [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                 c_grid_desc_mblock_mperblock_nblock_nperblock,
             [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups,
-            [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n,
-            [[maybe_unused]] const index_t groups_count)
+            [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
-    const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(gridDim.y / groups_count);
-    const index_t& num_blocks_per_n    = groups_count;
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch);
-    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
+    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
     const long_index_t a_batch_offset =
         amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
@@ -325,9 +320,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
     {
         namespace ctc = tensor_layout::convolution;
         using Layout  = std::conditional_t<
-            is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>(),
             ctc::NHWGC,
-            std::conditional_t<is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>(),
+            std::conditional_t<is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>(),
                                ctc::NDHWGC,
                                ALay>>;
 
@@ -353,8 +348,16 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
     static auto
     MakeBGridDescriptor_BK0_N_BK1(const ConvToGemmFwdTransformer& conv_to_gemm_transformer)
     {
+        namespace ctc = tensor_layout::convolution;
+        using Layout  = std::conditional_t<
+            is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>(),
+            ctc::GKYXC,
+            std::conditional_t<is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>(),
+                               ctc::GKZYXC,
+                               BLay>>;
+
         const auto wei_gemmnraw_gemmkraw_desc =
-            conv_to_gemm_transformer.template MakeBDescriptor_N_K<BLay>();
+            conv_to_gemm_transformer.template MakeBDescriptor_N_K<Layout>();
 
         const auto wei_gemmn_gemmk_desc =
             matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc);
@@ -377,9 +380,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
     {
         namespace ctc = tensor_layout::convolution;
         using Layout  = std::conditional_t<
-            is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>(),
             ctc::NHWGK,
-            std::conditional_t<is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>(),
+            std::conditional_t<is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>(),
                                ctc::NDHWGK,
                                ELay>>;
 
@@ -426,6 +429,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
                                     .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
 
+    using GKCYXTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKCYXTransposeDesc<NDimSpatial>({}, {}))>;
+    using GKYXCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeGKYXCTransposeDesc<NDimSpatial>({}, {}))>;
+
     static constexpr index_t ElementwiseBlocksize = ClusterLengthNPerBlock * ClusterLengthNPerBlock;
 
     using GridwiseElementwiseInputTranspose =
@@ -446,6 +456,24 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                             I1,
                             I0>;
 
+    using GridwiseElementwiseWeightTranspose =
+        GridwiseElementwise<Tuple<GKCYXTransposeDescType>,
+                            Tuple<GKYXCTransposeDescType>,
+                            Tuple<const BDataType*>,
+                            Tuple<BDataType*>,
+                            Block2TileMapElementwise,
+                            element_wise::PassThrough,
+                            ElementwiseBlocksize,
+                            NPerBlock,
+                            NPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<1>,
+                            Sequence<CDEBlockTransferScalarPerVector_NPerBlock>,
+                            I0,
+                            I1>;
+
     using GridwiseElementwiseOutputTranspose =
         GridwiseElementwise<Tuple<NHWGCTransposeDescType>,
                             Tuple<NGCHWTransposeDescType>,
@@ -508,12 +536,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
               p_b_grid_{},
               p_e_grid_{static_cast<EDataType*>(p_e)},
               a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths},
-              a_g_n_c_wis_strides_{conv_ngchw_to_nhwgc_transformer.TransposeStrides(
+              a_g_n_c_wis_strides_{conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
                   a_g_n_c_wis_lengths, a_g_n_c_wis_strides)},
               b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
+              b_g_k_c_xs_strides_{conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(
+                  b_g_k_c_xs_lengths, b_g_k_c_xs_strides)},
               e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths},
-              e_g_n_k_wos_strides_{conv_ngchw_to_nhwgc_transformer.TransposeStrides(
+              e_g_n_k_wos_strides_{conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
                   e_g_n_k_wos_lengths, e_g_n_k_wos_strides)},
               conv_filter_strides_{conv_filter_strides},
               conv_filter_dilations_{conv_filter_dilations},
@@ -559,8 +588,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             e_grid_desc_mblock_mperblock_nblock_nperblock_ =
                 MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n_);
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
             {
                 // Use not modified base strides
                 a_in_transpose_desc_ =
@@ -570,9 +599,18 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                     conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
                         a_g_n_c_wis_lengths, a_g_n_c_wis_strides);
 
+                b_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc<NDimSpatial>(
+                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides);
+                b_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc<NDimSpatial>(
+                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides);
+
                 e_in_transpose_desc_ =
                     conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
                         e_g_n_k_wos_lengths, e_g_n_k_wos_strides);
+                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{
+                    b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
                 e_out_transpose_desc_ =
                     conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
                         e_g_n_k_wos_lengths, e_g_n_k_wos_strides);
@@ -586,21 +624,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
 
         std::size_t GetWorkspaceATensorSizeBytes() const
         {
-            return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize();
-        }
-
-        std::size_t GetWorkspaceETensorSizeBytes() const
-        {
-            return sizeof(EDataType) * e_out_transpose_desc_.GetElementSpaceSize();
-        }
-
-        std::size_t GetWorkspaceSizeBytes() const
-        {
-            // Transpose require workspace for A and B
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
             {
-                return GetWorkspaceATensorSizeBytes() + GetWorkspaceETensorSizeBytes();
+                const long_index_t a_acum = ck::accumulate_n<long_index_t>(
+                    a_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                // Align to 128B
+                return math::integer_divide_ceil(sizeof(ADataType) * a_acum, 128) * 128;
             }
             else
             {
@@ -608,6 +638,43 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             }
         }
 
+        std::size_t GetWorkspaceBTensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+            {
+                const long_index_t b_acum = ck::accumulate_n<long_index_t>(
+                    b_g_k_c_xs_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                // Align to 128B
+                return math::integer_divide_ceil(sizeof(BDataType) * b_acum, 128) * 128;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceETensorSizeBytes() const
+        {
+            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            {
+                const long_index_t e_accum = ck::accumulate_n<long_index_t>(
+                    e_g_n_k_wos_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
+                return sizeof(EDataType) * e_accum;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        std::size_t GetWorkspaceSizeBytes() const
+        {
+            return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() +
+                   GetWorkspaceETensorSizeBytes();
+        }
+
         void Print() const
         {
             std::cout << "A[AK0, M, AK1]: " << a_grid_desc_ak0_m_ak1_ << std::endl;
@@ -657,10 +724,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
 
         // block-to-e-tile map
         Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_,
-            elementwise_block_2_ctile_map_transpose_e_;
+            elementwise_block_2_ctile_map_transpose_b_, elementwise_block_2_ctile_map_transpose_e_;
 
         NGCHWTransposeDescType a_in_transpose_desc_, e_out_transpose_desc_;
         NHWGCTransposeDescType a_out_transpose_desc_, e_in_transpose_desc_;
+        GKCYXTransposeDescType b_in_transpose_desc_;
+        GKYXCTransposeDescType b_out_transpose_desc_;
     };
 
     // Invoker
@@ -692,24 +761,30 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             std::tie(gdx, gdy, gdz) =
                 GridwiseGemm::CalculateGridSize(GemmM, GemmN, I1 /*arg.KBatch*/);
 
-            gdy *= arg.num_group_ * num_workgroups_per_Conv_N;
+            gdy = arg.num_group_;
+            gdz = num_workgroups_per_Conv_N;
 
             index_t K_split                  = (GemmK + KPerBlock - 1) / KPerBlock * KPerBlock;
             const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
 
             const ADataType* p_a_grid = arg.p_a_grid_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
             EDataType* p_e_grid       = arg.p_e_grid_;
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
             {
                 p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
-                p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
-                           arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType);
+                p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                           arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+                p_e_grid =
+                    type_convert<EDataType*>(arg.p_workspace_) +
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
             }
 
             typename GridwiseGemm::Argument gemm_arg{
-                p_a_grid, arg.p_b_grid_, p_e_grid, GemmM, GemmN, GemmK, I0, I0, I0, I1};
+                p_a_grid, p_b_grid, p_e_grid, GemmM, GemmN, GemmK, I0, I0, I0, I1};
 
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
@@ -741,8 +816,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                         arg.b_grid_desc_bk0_n_bk1_,
                         arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
                         arg.compute_ptr_offset_of_groups_,
-                        arg.compute_ptr_offset_of_n_,
-                        arg.num_group_);
+                        arg.compute_ptr_offset_of_n_);
                 }
                 else
                 {
@@ -757,8 +831,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                                                arg.b_grid_desc_bk0_n_bk1_,
                                                arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
                                                arg.compute_ptr_offset_of_groups_,
-                                               arg.compute_ptr_offset_of_n_,
-                                               arg.num_group_);
+                                               arg.compute_ptr_offset_of_n_);
                 }
             };
 
@@ -1008,50 +1081,68 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         {
             float avg_time = 0.f;
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
             {
-                const index_t grid_size =
+                const index_t a_grid_size =
                     arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
                         arg.a_in_transpose_desc_);
+                const index_t b_grid_size =
+                    arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize(
+                        arg.b_in_transpose_desc_);
 
                 ADataType* p_a_out_grid = type_convert<ADataType*>(arg.p_workspace_);
+                BDataType* p_b_out_grid = type_convert<BDataType*>(arg.p_workspace_) +
+                                          arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
 
-                auto kernel_transpose = kernel_elementwise<GridwiseElementwiseInputTranspose,
-                                                           ck::Tuple<NGCHWTransposeDescType>,
-                                                           ck::Tuple<NHWGCTransposeDescType>,
-                                                           ck::Tuple<const ADataType*>,
-                                                           ck::Tuple<ADataType*>,
-                                                           Block2TileMapElementwise,
-                                                           element_wise::PassThrough>;
+                auto kernel_transpose = kernel_elementwise_dual<GridwiseElementwiseInputTranspose,
+                                                                GridwiseElementwiseWeightTranspose,
+                                                                ck::Tuple<NGCHWTransposeDescType>,
+                                                                ck::Tuple<GKCYXTransposeDescType>,
+                                                                ck::Tuple<NHWGCTransposeDescType>,
+                                                                ck::Tuple<GKYXCTransposeDescType>,
+                                                                ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<const BDataType*>,
+                                                                ck::Tuple<ADataType*>,
+                                                                ck::Tuple<BDataType*>,
+                                                                Block2TileMapElementwise,
+                                                                Block2TileMapElementwise,
+                                                                element_wise::PassThrough>;
 
                 avg_time += launch_and_time_kernel(stream_config,
                                                    kernel_transpose,
-                                                   dim3(grid_size),
+                                                   dim3(a_grid_size + b_grid_size),
                                                    dim3(ElementwiseBlocksize),
                                                    0,
                                                    make_tuple(arg.a_in_transpose_desc_),
+                                                   make_tuple(arg.b_in_transpose_desc_),
                                                    make_tuple(arg.a_out_transpose_desc_),
+                                                   make_tuple(arg.b_out_transpose_desc_),
                                                    make_tuple(arg.p_a_grid_),
+                                                   make_tuple(arg.p_b_grid_),
                                                    make_tuple(p_a_out_grid),
+                                                   make_tuple(p_b_out_grid),
                                                    arg.elementwise_block_2_ctile_map_transpose_a_,
-                                                   element_wise::PassThrough{});
+                                                   arg.elementwise_block_2_ctile_map_transpose_b_,
+                                                   element_wise::PassThrough{},
+                                                   a_grid_size);
             }
 
             avg_time += RunGemm(arg, stream_config);
 
-            if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
             {
                 const index_t grid_size =
                     arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize(
                         arg.e_in_transpose_desc_);
 
-                const EDataType* p_e_out_grid =
+                const EDataType* p_e_in_grid =
                     type_convert<EDataType*>(arg.p_workspace_) +
-                    arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType);
+                    (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) /
+                        sizeof(EDataType);
 
-                EDataType* p_e_in_grid = arg.p_e_grid_;
+                EDataType* p_e_out_grid = arg.p_e_grid_;
 
                 auto kernel_transpose = kernel_elementwise<GridwiseElementwiseOutputTranspose,
                                                            ck::Tuple<NHWGCTransposeDescType>,
@@ -1068,8 +1159,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                                                    0,
                                                    make_tuple(arg.e_in_transpose_desc_),
                                                    make_tuple(arg.e_out_transpose_desc_),
-                                                   make_tuple(p_e_out_grid),
                                                    make_tuple(p_e_in_grid),
+                                                   make_tuple(p_e_out_grid),
                                                    arg.elementwise_block_2_ctile_map_transpose_e_,
                                                    element_wise::PassThrough{});
             }
@@ -1114,12 +1205,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             // check if it's 1x1, stride=1 conv
             for(index_t i = 0; i < NDimSpatial; ++i)
             {
-                const index_t X          = arg.b_g_k_c_xs_lengths_[i + 3];
+                const index_t SpatialDim = arg.b_g_k_c_xs_lengths_[i + 3];
                 const index_t ConvStride = arg.conv_filter_strides_[i];
                 const index_t LeftPad    = arg.input_left_pads_[i];
                 const index_t RightPad   = arg.input_right_pads_[i];
 
-                if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
+                if(!(SpatialDim == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
                 {
                     return false;
                 }
@@ -1131,11 +1222,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             // check if it's 1x1 conv
             for(index_t i = 0; i < NDimSpatial; ++i)
             {
-                const index_t X        = arg.b_g_k_c_xs_lengths_[i + 3];
-                const index_t LeftPad  = arg.input_left_pads_[i];
-                const index_t RightPad = arg.input_right_pads_[i];
+                const index_t SpatialDim = arg.b_g_k_c_xs_lengths_[i + 3];
+                const index_t LeftPad    = arg.input_left_pads_[i];
+                const index_t RightPad   = arg.input_right_pads_[i];
 
-                if(!(X == 1 && LeftPad == 0 && RightPad == 0))
+                if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0))
                 {
                     return false;
                 }
@@ -1167,7 +1258,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                      is_same_v<BLayout, ctc::G_K_ZYX_C> || is_same_v<BLayout, ctc::GKXC> ||
                      is_same_v<BLayout, ctc::GKYXC> || is_same_v<BLayout, ctc::GKZYXC> ||
                      is_same_v<BLayout, ctc::KXGC> || is_same_v<BLayout, ctc::KYXGC> ||
-                     is_same_v<BLayout, ctc::KZYXGC>)
+                     is_same_v<BLayout, ctc::KZYXGC> || is_same_v<BLayout, ctc::GKCX> ||
+                     is_same_v<BLayout, ctc::GKCYX> || is_same_v<BLayout, ctc::GKCZYX>)
 
         {
             if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0))
@@ -1180,8 +1272,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             return false;
         }
 
-        if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                     is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+        if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                     is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
         {
             if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
             {
@@ -1207,6 +1299,25 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
             {
                 return false;
             }
+
+            if(!arg.p_workspace_)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Warning: Workspace for "
+                                 "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3::Argument is not "
+                                 "allocated, use SetWorkSpacePointer."
+                              << std::endl;
+                }
+                return false;
+            }
+
+            constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+            if(!(arg.a_out_transpose_desc_.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+                 arg.e_in_transpose_desc_.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB))
+            {
+                return false;
+            }
         }
 
         // check vector access of E
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp
index 3bcd8859aa..5de429f9e5 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -59,6 +59,22 @@ constexpr bool is_NGCHW_GKYXC_NGKHW()
            is_same_v<WeiLayout, tensor_layout::convolution::GKYXC> &&
            is_same_v<OutLayout, tensor_layout::convolution::NGKHW>;
 }
+
+template <typename InLayout, typename WeiLayout, typename OutLayout>
+constexpr bool is_NGCHW_GKCYX_NGKHW()
+{
+    return is_same_v<InLayout, tensor_layout::convolution::NGCHW> &&
+           is_same_v<WeiLayout, tensor_layout::convolution::GKCYX> &&
+           is_same_v<OutLayout, tensor_layout::convolution::NGKHW>;
+}
+
+template <typename InLayout, typename WeiLayout, typename OutLayout>
+constexpr bool is_NGCHW_NGKHW()
+{
+    return is_same_v<InLayout, tensor_layout::convolution::NGCHW> &&
+           is_same_v<OutLayout, tensor_layout::convolution::NGKHW>;
+}
+
 // 3d
 template <typename InLayout, typename WeiLayout, typename OutLayout>
 constexpr bool is_NDHWGC_GKZYXC_NDHWGK()
@@ -84,6 +100,21 @@ constexpr bool is_NGCDHW_GKZYXC_NGKDHW()
            is_same_v<OutLayout, tensor_layout::convolution::NGKDHW>;
 }
 
+template <typename InLayout, typename WeiLayout, typename OutLayout>
+constexpr bool is_NGCDHW_GKCZYX_NGKDHW()
+{
+    return is_same_v<InLayout, tensor_layout::convolution::NGCDHW> &&
+           is_same_v<WeiLayout, tensor_layout::convolution::GKCZYX> &&
+           is_same_v<OutLayout, tensor_layout::convolution::NGKDHW>;
+}
+
+template <typename InLayout, typename WeiLayout, typename OutLayout>
+constexpr bool is_NGCDHW_NGKDHW()
+{
+    return is_same_v<InLayout, tensor_layout::convolution::NGCDHW> &&
+           is_same_v<OutLayout, tensor_layout::convolution::NGKDHW>;
+}
+
 template <typename InLayout, typename WeiLayout, typename OutLayout>
 constexpr bool is_NSpatialGC_GKSpatial_NSpatialGK()
 {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
index 959fc890b8..10d8a4a44d 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -8,6 +8,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -420,7 +421,8 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
                  CDEElementwiseOperation cde_element_op)
             : a_element_op_{a_element_op},
               b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
+              cde_element_op_{cde_element_op},
+              gemm_kernel_host_args_{nullptr}
         {
             grid_size_ = 0;
 
@@ -538,6 +540,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
         std::vector<Tuple<index_t, index_t>> b_mtx_nraw_kraw_;
 
         index_t grid_size_;
+        void* gemm_kernel_host_args_;
     };
 
     // Invoker
@@ -545,7 +548,10 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& arg,
+                  const StreamConfig& stream_config = StreamConfig{},
+                  hipStream_t cpy_stream            = nullptr,
+                  hipEvent_t cpy_event              = nullptr)
         {
             auto K0 = arg.gemm_desc_kernel_arg_[0].a_grid_desc_k0_m_k1_.GetLength(I0);
             bool all_has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
@@ -602,12 +608,36 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(
-                hipMemcpyAsync(arg.p_workspace_,
-                               arg.gemm_desc_kernel_arg_.data(),
-                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                               hipMemcpyHostToDevice,
-                               stream_config.stream_id_));
+            // If the user provides copy stream and copy event, we assume that they're also
+            // responsible for providing allocated host memory (eg. pinned) which
+            // would be used to copy kernel arguments to the device.
+            if(cpy_stream && cpy_event)
+            {
+                if(arg.gemm_kernel_host_args_ == nullptr)
+                {
+                    std::ostringstream err;
+                    err << "No memory has been allocated for gemm kernel host args "
+                        << "when providing the copy stream and copy event! In " << __FILE__ << ":"
+                        << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+                hipGetErrorString(hipMemcpyAsync(arg.p_workspace_,
+                                                 arg.gemm_kernel_host_args_,
+                                                 arg.group_count_ * sizeof(GemmKernelArg),
+                                                 hipMemcpyHostToDevice,
+                                                 cpy_stream));
+                hipGetErrorString(hipEventRecord(cpy_event, cpy_stream));
+                hipGetErrorString(hipEventSynchronize(cpy_event));
+            }
+            else // In this case CK owns memory allocated on host.
+            {
+                hipGetErrorString(
+                    hipMemcpyAsync(arg.p_workspace_,
+                                   arg.gemm_desc_kernel_arg_.data(),
+                                   arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                                   hipMemcpyHostToDevice,
+                                   stream_config.stream_id_));
+            }
 
             auto launch_kernel = [&](auto has_main_k_block_loop,
                                      auto has_double_tail_k_block_loop) {
@@ -762,6 +792,40 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
     {
         return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmKernelArg);
     }
+
+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    {
+        return GetWorkSpaceSize(p_arg);
+    }
+
+    size_t GetHostKernelArgSize(const BaseArgument* p_arg) const { return GetWorkSpaceSize(p_arg); }
+
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    {
+        return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
+    }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the host kernel arguments pointer and copies that data on the host side.
+    ///             This function can be utilised to use pinned memory for the host args and
+    ///             achieve fully async data copy.
+    ///
+    /// @param      p_arg              The pointer to the Argument we're going to update.
+    /// @param[in]  p_host_kernel_args The pointer to the host memory where the kernel
+    ///                                arguments will be copied
+    void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const
+    {
+        Argument* pArg_ = dynamic_cast<Argument*>(p_arg);
+        if(!pArg_)
+        {
+            throw std::runtime_error("Failed to cast argument pointer!");
+        }
+
+        pArg_->gemm_kernel_host_args_ = p_host_kernel_args;
+        std::copy(pArg_->gemm_desc_kernel_arg_.begin(),
+                  pArg_->gemm_desc_kernel_arg_.end(),
+                  static_cast<GemmKernelArg*>(pArg_->gemm_kernel_host_args_));
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
index d692aa05ce..18872e38ea 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -8,6 +8,7 @@
 #include <tuple>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
index 8b40eea56c..aa70a24fc1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -1,6 +1,6 @@
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -8,6 +8,7 @@
 #include <sstream>
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -500,6 +501,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
         std::vector<Tuple<index_t, index_t>> b_mtx_nraw_kraw_;
 
         index_t grid_size_;
+        void* gemm_kernel_host_args_;
     };
 
     // Invoker
@@ -507,7 +509,10 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& arg,
+                  const StreamConfig& stream_config = StreamConfig{},
+                  hipStream_t cpy_stream            = nullptr,
+                  hipEvent_t cpy_event              = nullptr)
         {
             bool has_main_k_block_loop = true;
 
@@ -556,12 +561,36 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                 }
             }
 
-            hipGetErrorString(
-                hipMemcpyAsync(arg.p_workspace_,
-                               arg.gemm_desc_kernel_arg_.data(),
-                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
-                               hipMemcpyHostToDevice,
-                               stream_config.stream_id_));
+            // If the user provides copy stream and copy event, we assume that they're also
+            // responsible for providing allocated host memory (eg. pinned) which
+            // would be used to copy kernel arguments to the device.
+            if(cpy_stream && cpy_event)
+            {
+                if(arg.gemm_kernel_host_args_ == nullptr)
+                {
+                    std::ostringstream err;
+                    err << "No memory has been allocated for gemm kernel host args "
+                        << "when providing the copy stream and copy event! In " << __FILE__ << ":"
+                        << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+                hipGetErrorString(hipMemcpyAsync(arg.p_workspace_,
+                                                 arg.gemm_kernel_host_args_,
+                                                 arg.group_count_ * sizeof(GemmBiasTransKernelArg),
+                                                 hipMemcpyHostToDevice,
+                                                 cpy_stream));
+                hipGetErrorString(hipEventRecord(cpy_event, cpy_stream));
+                hipGetErrorString(hipEventSynchronize(cpy_event));
+            }
+            else // In this case CK owns memory allocated on host.
+            {
+                hipGetErrorString(hipMemcpyAsync(arg.p_workspace_,
+                                                 arg.gemm_desc_kernel_arg_.data(),
+                                                 arg.gemm_desc_kernel_arg_.size() *
+                                                     sizeof(GemmBiasTransKernelArg),
+                                                 hipMemcpyHostToDevice,
+                                                 stream_config.stream_id_));
+            }
 
             float ave_time = 0;
 
@@ -735,6 +764,31 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
     {
         return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
     }
+
+    size_t GetHostKernelArgSize(const BaseArgument* p_arg) const { return GetWorkSpaceSize(p_arg); }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the host kernel arguments pointer and copies that data on the host side.
+    ///             This function can be utilised to use pinned memory for the host args and
+    ///             achieve fully async data copy.
+    ///
+    /// @param      p_arg              The pointer to the Argument we're going to update.
+    /// @param[in]  p_host_kernel_args The pointer to the host memory where the kernel
+    ///                                arguments will be copied
+    ///
+    void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const
+    {
+        Argument* pArg_ = dynamic_cast<Argument*>(p_arg);
+        if(!pArg_)
+        {
+            throw std::runtime_error("Failed to cast argument pointer!");
+        }
+
+        pArg_->gemm_kernel_host_args_ = p_host_kernel_args;
+        std::copy(pArg_->gemm_desc_kernel_arg_.begin(),
+                  pArg_->gemm_desc_kernel_arg_.end(),
+                  static_cast<GemmBiasTransKernelArg*>(pArg_->gemm_kernel_host_args_));
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
index 994c667fbc..01f52881f4 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,7 @@
 #include <sstream>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
@@ -244,7 +245,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                  std::vector<void*>& p_Es,
                  std::vector<GemmDesc>& gemm_descs,
                  index_t kbatch)
-            : K_BATCH{kbatch}
+            : K_BATCH{kbatch}, gemm_kernel_host_args_{nullptr}
         {
             grid_size_   = 0;
             group_count_ = ck::type_convert<ck::index_t>(gemm_descs.size());
@@ -365,13 +366,17 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         index_t skipped_group_count_;
 
         std::vector<GemmTransKernelArg> gemm_kernel_args_;
+        void* gemm_kernel_host_args_;
         index_t grid_size_;
     };
 
     // Invoker
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& arg,
+                  const StreamConfig& stream_config = StreamConfig{},
+                  hipStream_t cpy_stream            = nullptr,
+                  hipEvent_t cpy_event              = nullptr)
         {
             index_t K0                       = arg.gemm_kernel_args_[0].karg_.K0Padded;
             bool all_have_kbatch_gt_one      = arg.gemm_kernel_args_[0].karg_.k_batch > 1;
@@ -419,12 +424,37 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                 }
             }
 
-            hip_check_error(
-                hipMemcpyAsync(arg.p_workspace_,
-                               arg.gemm_kernel_args_.data(),
-                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                               hipMemcpyHostToDevice,
-                               stream_config.stream_id_));
+            // If the user provides copy stream and copy event, we assume that they're also
+            // responsible for providing allocated host memory (eg. pinned) which
+            // would be used to copy kernel arguments to the device.
+            if(cpy_stream && cpy_event)
+            {
+                if(arg.gemm_kernel_host_args_ == nullptr)
+                {
+                    std::ostringstream err;
+                    err << "No memory has been allocated for gemm kernel host args "
+                        << "when providing the copy stream and copy event! In " << __FILE__ << ":"
+                        << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+                hip_check_error(hipMemcpyAsync(arg.p_workspace_,
+                                               arg.gemm_kernel_host_args_,
+                                               arg.group_count_ * sizeof(GemmTransKernelArg),
+                                               hipMemcpyHostToDevice,
+                                               cpy_stream));
+                hip_check_error(hipEventRecord(cpy_event, cpy_stream));
+                hip_check_error(hipEventSynchronize(cpy_event));
+            }
+            else // In this case CK owns memory allocated on host.
+            {
+
+                hip_check_error(
+                    hipMemcpyAsync(arg.p_workspace_,
+                                   arg.gemm_kernel_args_.data(),
+                                   arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                   hipMemcpyHostToDevice,
+                                   stream_config.stream_id_));
+            }
 
             float ave_time = 0;
 
@@ -652,6 +682,8 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
         return GetWorkSpaceSize(p_arg);
     }
 
+    size_t GetHostKernelArgSize(const BaseArgument* p_arg) const { return GetWorkSpaceSize(p_arg); }
+
     // TODO: deperecation notice.
     static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); }
 
@@ -673,6 +705,29 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
     {
         return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
     }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the host kernel arguments pointer and copies that data on the host side.
+    ///             This function can be utilised to use pinned memory for the host args and
+    ///             achieve fully async data copy.
+    ///
+    /// @param      p_arg              The pointer to the Argument we're going to update.
+    /// @param[in]  p_host_kernel_args The pointer to the host memory where the kernel
+    ///                                arguments will be copied
+    ///
+    void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const
+    {
+        Argument* pArg_ = dynamic_cast<Argument*>(p_arg);
+        if(!pArg_)
+        {
+            throw std::runtime_error("Failed to cast argument pointer!");
+        }
+
+        pArg_->gemm_kernel_host_args_ = p_host_kernel_args;
+        std::copy(pArg_->gemm_kernel_args_.begin(),
+                  pArg_->gemm_kernel_args_.end(),
+                  static_cast<GemmTransKernelArg*>(pArg_->gemm_kernel_host_args_));
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
new file mode 100644
index 0000000000..08d177035e
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
@@ -0,0 +1,527 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          index_t ActivationOP                        = 0,
+          bool NSwizzle                               = false,
+          bool IsInputGemm                            = true,
+          bool MulRoutedWeight                        = true,
+          bool PerTokenQuant                          = true,
+          typename IndexType                          = index_t,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          typename LDSTypeA                           = ComputeTypeA,
+          typename LDSTypeB                           = ComputeTypeB>
+struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
+                                                                   BLayout,
+                                                                   DsLayout,
+                                                                   CLayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   DsDataType,
+                                                                   CDataType,
+                                                                   AElementwiseOperation,
+                                                                   BElementwiseOperation,
+                                                                   CElementwiseOperation>
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+    using GridwiseGemm =
+        GridwiseMoeGemm<ALayout,
+                        BLayout,
+                        DsLayout,
+                        CLayout,
+                        ADataType,
+                        BDataType,
+                        GemmAccDataType,
+                        CShuffleDataType,
+                        DsDataType,
+                        CDataType,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation,
+                        GemmSpec,
+                        BlockSize,
+                        MPerBlock,
+                        NPerBlock,
+                        KPerBlock,
+                        AK1,
+                        BK1,
+                        MPerXDL,
+                        NPerXDL,
+                        MXdlPerWave,
+                        NXdlPerWave,
+                        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                        ABlockTransferThreadClusterArrangeOrder,
+                        ABlockTransferSrcAccessOrder,
+                        ABlockTransferSrcVectorDim,
+                        ABlockTransferSrcScalarPerVector,
+                        ABlockTransferDstScalarPerVector_AK1,
+                        false,
+                        ABlockLdsExtraM,
+                        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                        BBlockTransferThreadClusterArrangeOrder,
+                        BBlockTransferSrcAccessOrder,
+                        BBlockTransferSrcVectorDim,
+                        BBlockTransferSrcScalarPerVector,
+                        BBlockTransferDstScalarPerVector_BK1,
+                        false,
+                        BBlockLdsExtraN,
+                        CShuffleMXdlPerWavePerShuffle,
+                        CShuffleNXdlPerWavePerShuffle,
+                        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                        CDEShuffleBlockTransferScalarPerVectors,
+                        BlkGemmPipeSched,
+                        BlkGemmPipelineVer,
+                        ActivationOP,
+                        NSwizzle,
+                        IsInputGemm,
+                        MulRoutedWeight,
+                        PerTokenQuant,
+                        IndexType,
+                        ComputeTypeA,
+                        ComputeTypeB,
+                        LDSTypeA,
+                        LDSTypeB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    int GetPreShuffleParameters() override { return NPerXDL; }
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto RunKernel = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+
+                    std::array<std::size_t, NumDTensor> DsSize;
+
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / BPackedSize;
+
+                    const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N(
+                        arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs);
+
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                        DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
+                    });
+                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr auto estimated_reg_a = MPerBlock * KPerBlock * sizeof(ADataType) / BlockSize /
+                                             4 * (1 + GridwiseGemm::NWave);
+            constexpr auto estimated_reg_b = NPerBlock * KPerBlock * sizeof(BDataType) / BlockSize /
+                                             4 * (2) * (IsInputGemm ? 2 : 1);
+            constexpr auto estimated_reg_c = MPerBlock * NPerBlock * sizeof(GemmAccDataType) /
+                                             BlockSize / 4 * (IsInputGemm ? 2 : 1);
+            constexpr auto estimated_reg_total =
+                estimated_reg_a + estimated_reg_b + estimated_reg_c;
+
+            constexpr index_t minimum_occupancy = (estimated_reg_total >= 256) ? 1 : 2;
+
+            constexpr auto MemoryDataOp =
+                IsInputGemm ? InMemoryDataOperationEnum::Set : InMemoryDataOperationEnum::AtomicAdd;
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                                true,
+                                                                MemoryDataOp,
+                                                                minimum_occupancy,
+                                                                TailNumber::Odd>;
+                            RunKernel(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                                true,
+                                                                MemoryDataOp,
+                                                                minimum_occupancy,
+                                                                TailNumber::Even>;
+                            RunKernel(kernel);
+                        }
+                    }
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 ||
+                                  BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                    {
+                        const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 MemoryDataOp,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Odd>;
+                        RunKernel(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 MemoryDataOp,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Even>;
+                        RunKernel(kernel);
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("todo: only v1 & v2 support now");
+                }
+            }
+#if 1
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy,
+                                                        TailNumber::Odd>;
+                    RunKernel(kernel);
+                }
+            }
+#endif
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        // only impl kbatch 1 now
+        if(arg.KBatch > 1)
+        {
+            return false;
+        }
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+        if(arg.N % NPerBlock != 0 || arg.K % KPerBlock != 0)
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_sorted_token_ids,
+                             const void* p_sorted_expert_ids,
+                             const void* p_max_token_id,
+                             const void* p_a,
+                             const void* p_b,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_c,
+                             index_t NumTokens,
+                             index_t TopK,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{static_cast<const index_t*>(p_sorted_token_ids),
+                        static_cast<const index_t*>(p_sorted_expert_ids),
+                        static_cast<const index_t*>(p_max_token_id),
+                        static_cast<const ADataType*>(p_a),
+                        static_cast<const BDataType*>(p_b),
+                        p_ds,
+                        static_cast<CDataType*>(p_c),
+                        NumTokens,
+                        TopK,
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideB,
+                        StrideDs,
+                        StrideC,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      std::array<const void*, NumDTensor> p_ds,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      std::array<ck::index_t, NumDTensor> StrideDs,
+                                                      index_t StrideC,
+                                                      index_t KBatch,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(nullptr,
+                                          nullptr,
+                                          nullptr,
+                                          static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          p_ds,
+                                          static_cast<CDataType*>(p_c),
+                                          M, // randoms set, no use
+                                          0,
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideDs,
+                                          StrideC,
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"}, {BlockGemmPipelineVersion::v2, "v2"}};
+
+        // clang-format off
+        str << "DeviceMoeGEmm"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
index 7a62ec0465..df3c929c2e 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
@@ -12,7 +12,11 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#if __clang_major__ == 20
+#include "ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm_builtins.hpp"
+#else
 #include "ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp"
+#endif
 
 namespace ck {
 namespace tensor_operation {
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index f1d0f9844d..672998d811 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -79,6 +79,24 @@ __device__ inline half4_t i4_to_half4_scale(int q, const ck::half2_t& scale)
     return res.template AsType<half4_t>()[Number<0>{}];
 }
 
+__device__ inline f8x4_t i4_to_f8x4(int q)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+
+    int lo = amd_assembly_and_b32(q, LO);
+    int hi = amd_assembly_and_b32(q, HI);
+
+    float f32_0 = amd_assemble_cvt_f32_i4(lo);
+    float f32_1 = amd_assemble_cvt_f32_i4(lo >> 16);
+    float f32_2 = amd_assemble_cvt_f32_i4(hi);
+    float f32_3 = amd_assemble_cvt_f32_i4(hi >> 16);
+
+    return amd_assembly_cvt_f8_to_f32(f32_0, f32_1, f32_2, f32_3);
+}
+
+__device__ inline f8x8_t i4_to_fp8x8(int q) { return amd_assembly_i4_to_fp8x8(q); }
+
 __device__ inline bhalf4_t i4_to_bhalf4(int q)
 {
     uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
@@ -142,6 +160,55 @@ struct PassThroughPack8
 #endif
     }
 
+    __host__ __device__ constexpr void operator()(ck::f8x8_t& y, const ck::pk_i4x4_t& x) const
+    {
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        y = i4_to_fp8x8(bit_cast<int>(x));
+
+#else
+        // Added pk_i4_t to f8x2_fnuz_t conversion
+        vector_type<f8_t, 8> dst;
+        vector_type<float, 8> dst_tmp;
+        vector_type<pk_i4_t, 4> src{x};
+
+        // pk_i4_t to float2_t conversion
+        dst_tmp.template AsType<float2_t>()(Number<0>{}) =
+            type_convert<float2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
+
+        dst_tmp.template AsType<float2_t>()(Number<1>{}) =
+            type_convert<float2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
+
+        dst_tmp.template AsType<float2_t>()(Number<2>{}) =
+            type_convert<float2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
+
+        dst_tmp.template AsType<float2_t>()(Number<3>{}) =
+            type_convert<float2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);
+
+        // float to f8_t conversion
+        dst.template AsType<f8_t>()(Number<0>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<0>{}]);
+        dst.template AsType<f8_t>()(Number<1>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<1>{}]);
+
+        dst.template AsType<f8_t>()(Number<2>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<2>{}]);
+        dst.template AsType<f8_t>()(Number<3>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<3>{}]);
+
+        dst.template AsType<f8_t>()(Number<4>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<4>{}]);
+        dst.template AsType<f8_t>()(Number<5>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<5>{}]);
+
+        dst.template AsType<f8_t>()(Number<6>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<6>{}]);
+        dst.template AsType<f8_t>()(Number<7>{}) =
+            type_convert<f8_t>(dst_tmp.template AsType<float>()[Number<7>{}]);
+
+        y = dst.template AsType<f8x8_t>()[Number<0>{}];
+#endif
+    }
+
     __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const
     {
 #if CK_USE_PK4_LAYOUT_SHUFFLE
@@ -290,6 +357,12 @@ struct PassThrough
         y = type_convert<half_t>(x);
     }
 
+    template <>
+    __host__ __device__ void operator()<float, int32_t>(float& y, const int32_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
     template <>
     __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
     {
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 64fad1ca48..311545aad6 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -1438,6 +1438,7 @@ struct BlockToCTileMap_GemmStreamK_v2
     __host__ __device__ BlockToCTileMap_GemmStreamK_v2(
         uint32_t m, uint32_t n, uint32_t k, uint32_t grid_size = 1, uint32_t streamk_sel = 1)
     {
+
         // total output tiles
         uint32_t num_tiles =
             math::integer_divide_ceil(m, MPerBlock) * math::integer_divide_ceil(n, NPerBlock);
@@ -1445,6 +1446,9 @@ struct BlockToCTileMap_GemmStreamK_v2
 
         uint32_t dp_tiles, dp_num_blocks, sk_total_iters;
 
+        // Ensure grid_size is at least 1 to avoid division by zero
+        grid_size = math::max(grid_size, 1u);
+
         // default to regular DP GEMM if sk blocks == 0
         if(streamk_sel == 0)
         {
@@ -1460,31 +1464,45 @@ struct BlockToCTileMap_GemmStreamK_v2
         // 2-tile sk + DP GEMM
         else
         {
-
             // check if there's enough work for DP+ stream-k
             bool bigEnough = num_tiles > grid_size;
-            // select between stream-k strategies
+
+            // Select between stream-k strategies
+            // Add safety checks to prevent zero or negative values
             uint32_t sk_tiles = 0;
             if(streamk_sel == 1) // 1 tile stream-k
             {
                 sk_tiles = bigEnough ? (num_tiles % grid_size) : num_tiles;
+
+                // Ensure sk_tiles is at least 1
+                sk_tiles = math::max(sk_tiles, 1u);
             }
             else if(streamk_sel == 2) // 2-tile stream-k
             {
                 sk_tiles = bigEnough ? (grid_size + num_tiles % grid_size) : num_tiles;
+
+                // Ensure sk_tiles is at least 1 but not more than num_tiles
+                sk_tiles = math::min(math::max(sk_tiles, 1u), num_tiles);
             }
             else if(streamk_sel == 3) // 3-tile stream-k
             {
                 sk_tiles = (num_tiles > (2 * grid_size)) ? (2 * grid_size + num_tiles % grid_size)
                                                          : num_tiles;
+
+                // Ensure sk_tiles is at least 1 but not more than num_tiles
+                sk_tiles = math::min(math::max(sk_tiles, 1u), num_tiles);
             }
             else if(streamk_sel == 4) // 4-tile stream-k
             {
                 sk_tiles = (num_tiles > (3 * grid_size)) ? (3 * grid_size + num_tiles % grid_size)
                                                          : num_tiles;
+
+                // Ensure sk_tiles is at least 1 but not more than num_tiles
+                sk_tiles = math::min(math::max(sk_tiles, 1u), num_tiles);
             }
+
             sk_num_blocks = sk_tiles;
-            // remaining tiles are DP tiles
+            // Remaining tiles are DP tiles
             dp_tiles = bigEnough ? (num_tiles - sk_tiles) : 0;
 
             sk_total_iters = k_iters_per_tile.get() * sk_tiles;
@@ -1500,24 +1518,51 @@ struct BlockToCTileMap_GemmStreamK_v2
             //      => sk_blocks * m + b = sk_total_iters
             //      => b = sk_total_iters - m * sk_blocks
             //      NOTE: big could be zero
-            uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks;
-            sk_num_big_blocks             = sk_total_iters - k_iters_per_sk_block * sk_num_blocks;
-            k_iters_per_big_block         = k_iters_per_sk_block + 1;
+
+            // Add safety check for sk_num_blocks to prevent division by zero
+            if(sk_num_blocks > 0)
+            {
+                uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks;
+                sk_num_big_blocks     = sk_total_iters - k_iters_per_sk_block * sk_num_blocks;
+                k_iters_per_big_block = k_iters_per_sk_block + 1;
+            }
+            else
+            {
+                // Fallback to default GEMM if no stream-k blocks
+                sk_num_blocks         = 0;
+                sk_num_big_blocks     = 0;
+                k_iters_per_big_block = 0;
+                dp_tiles              = num_tiles;
+                dp_num_blocks         = num_tiles;
+                dp_start_block_idx    = 0;
+                sk_total_iters        = 0;
+            }
 
             dp_num_blocks      = dp_tiles;
             dp_start_block_idx = sk_num_blocks;
         }
 
         n_tiles = MDiv2(math::integer_divide_ceil(n, NPerBlock));
-        // using multiple blocks for parallel reduction
+        // Using multiple blocks for parallel reduction
         reduction_start_block_idx = dp_start_block_idx + dp_num_blocks;
 
         if constexpr(ReductionStrategy == StreamKReductionStrategy::Reduction)
         {
-            uint32_t upper_big    = math::lcm(k_iters_per_big_block, k_iters_per_tile.get());
-            uint32_t upper_little = math::lcm(k_iters_per_big_block - 1, k_iters_per_tile.get());
-            equiv_tiles_big       = MDiv(upper_big / k_iters_per_tile.get());
-            equiv_tiles_little    = MDiv(upper_little / k_iters_per_tile.get());
+            // Add additional safety checks
+            if(k_iters_per_big_block > 0 && k_iters_per_tile.get() > 0)
+            {
+                uint32_t upper_big = math::lcm(k_iters_per_big_block, k_iters_per_tile.get());
+                uint32_t upper_little =
+                    math::lcm(math::max(k_iters_per_big_block - 1, 1u), k_iters_per_tile.get());
+                equiv_tiles_big    = MDiv(upper_big / k_iters_per_tile.get());
+                equiv_tiles_little = MDiv(upper_little / k_iters_per_tile.get());
+            }
+            else
+            {
+                // Default safe values
+                equiv_tiles_big    = MDiv(1);
+                equiv_tiles_little = MDiv(1);
+            }
         }
     }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
index f4d0989088..d728360c55 100644
--- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp
@@ -517,8 +517,9 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<ABDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
index 55e254e015..50b4a734fa 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -450,8 +450,9 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
index fd16927cc1..79a9410898 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
@@ -361,9 +361,11 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         const auto M = d0_grid_desc_m_n.GetLength(I0);
         const auto N = d0_grid_desc_m_n.GetLength(I1);
 
+        constexpr auto lcm_A0K1_B0K1 = math::lcm(A0K1, B0K1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<A0B0B1DataType, half_t>::value || is_same<A0B0B1DataType, bhalf_t>::value) &&
-             math::lcm(A0K1, B0K1) <= 4)
+            (((is_same<A0B0B1DataType, half_t>::value || is_same<A0B0B1DataType, bhalf_t>::value) &&
+              lcm_A0K1_B0K1 <= 4) ||
+             (is_same<A0B0B1DataType, int8_t>::value && lcm_A0K1_B0K1 <= 8))
                 ? true
                 : false;
         constexpr auto mfma = MfmaSelector<A0B0B1DataType,
@@ -653,8 +655,9 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
         // sanity check
         constexpr auto lcm_A0K1_B0K1 = math::lcm(A0K1, B0K1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<A0B0B1DataType, half_t>::value || is_same<A0B0B1DataType, bhalf_t>::value) &&
-             lcm_A0K1_B0K1 <= 4)
+            (((is_same<A0B0B1DataType, half_t>::value || is_same<A0B0B1DataType, bhalf_t>::value) &&
+              lcm_A0K1_B0K1 <= 4) ||
+             (is_same<A0B0B1DataType, int8_t>::value && lcm_A0K1_B0K1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
index 1f7458e68f..d15767f658 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -343,9 +343,11 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         const auto M = d0_grid_desc_m_n.GetLength(I0);
         const auto N = d0_grid_desc_m_n.GetLength(I1);
 
+        constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             math::lcm(AK1, BK1) <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr auto mfma =
@@ -560,8 +562,9 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
index f7746b470f..a11d696019 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -471,8 +471,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
index 2bbe4c8ce8..1326c5d62d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -41,13 +41,16 @@ __global__ void
                                     elementwise_op);
 }
 
-template <typename GridwiseElementwiseFunctor,
+template <typename GridwiseElementwiseFunctorA,
+          typename GridwiseElementwiseFunctorB,
           typename InAGridDescTuple,
           typename InBGridDescTuple,
           typename OutAGridDescTuple,
           typename OutBGridDescTuple,
-          typename InDataTypePointerTuple,
-          typename OutDataTypePointerTuple,
+          typename InADataTypePointerTuple,
+          typename InBDataTypePointerTuple,
+          typename OutADataTypePointerTuple,
+          typename OutBDataTypePointerTuple,
           typename Block2TileMapA,
           typename Block2TileMapB,
           typename ElementwiseOperation>
@@ -55,14 +58,14 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_elementwise_dual(const InBGridDescTuple in_grid_desc_tuple_a,
+        kernel_elementwise_dual(const InAGridDescTuple in_grid_desc_tuple_a,
                                 const InBGridDescTuple in_grid_desc_tuple_b,
                                 const OutAGridDescTuple out_grid_desc_tuple_a,
                                 const OutBGridDescTuple out_grid_desc_tuple_b,
-                                const InDataTypePointerTuple p_in_global_tuple_a,
-                                const InDataTypePointerTuple p_in_global_tuple_b,
-                                const OutDataTypePointerTuple p_out_global_tuple_a,
-                                const OutDataTypePointerTuple p_out_global_tuple_b,
+                                const InADataTypePointerTuple p_in_global_tuple_a,
+                                const InBDataTypePointerTuple p_in_global_tuple_b,
+                                const OutADataTypePointerTuple p_out_global_tuple_a,
+                                const OutBDataTypePointerTuple p_out_global_tuple_b,
                                 const Block2TileMapA block_2_tile_map_a,
                                 const Block2TileMapB block_2_tile_map_b,
                                 const ElementwiseOperation elementwise_op,
@@ -70,23 +73,136 @@ __global__ void
 {
     if(get_block_1d_id() < a_grid_size)
     {
-        GridwiseElementwiseFunctor::Run(in_grid_desc_tuple_a,
-                                        out_grid_desc_tuple_a,
-                                        p_in_global_tuple_a,
-                                        p_out_global_tuple_a,
-                                        block_2_tile_map_a,
-                                        elementwise_op,
-                                        get_block_1d_id());
+        GridwiseElementwiseFunctorA::Run(in_grid_desc_tuple_a,
+                                         out_grid_desc_tuple_a,
+                                         p_in_global_tuple_a,
+                                         p_out_global_tuple_a,
+                                         block_2_tile_map_a,
+                                         elementwise_op,
+                                         get_block_1d_id());
     }
     else
     {
-        GridwiseElementwiseFunctor::Run(in_grid_desc_tuple_b,
-                                        out_grid_desc_tuple_b,
-                                        p_in_global_tuple_b,
-                                        p_out_global_tuple_b,
-                                        block_2_tile_map_b,
-                                        elementwise_op,
-                                        get_block_1d_id() - a_grid_size);
+        GridwiseElementwiseFunctorB::Run(in_grid_desc_tuple_b,
+                                         out_grid_desc_tuple_b,
+                                         p_in_global_tuple_b,
+                                         p_out_global_tuple_b,
+                                         block_2_tile_map_b,
+                                         elementwise_op,
+                                         get_block_1d_id() - a_grid_size);
+    }
+}
+
+template <typename GridwiseElementwiseFunctorA,
+          typename GridwiseElementwiseFunctorB,
+          typename InAGridDescTuple,
+          typename InBGridDescTuple,
+          typename OutAGridDescTuple,
+          typename OutBGridDescTuple,
+          typename InADataTypePointerTuple,
+          typename InBDataTypePointerTuple,
+          typename OutADataTypePointerTuple,
+          typename OutBDataTypePointerTuple,
+          typename Block2TileMapA,
+          typename Block2TileMapB,
+          typename ElementwiseOperation,
+          index_t NumInputsA,
+          index_t NumInputsB,
+          index_t NumOutputsA,
+          index_t NumOutputsB>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_elementwise_batched_dual(
+            const InAGridDescTuple in_grid_desc_tuple_a,
+            const InBGridDescTuple in_grid_desc_tuple_b,
+            const OutAGridDescTuple out_grid_desc_tuple_a,
+            const OutBGridDescTuple out_grid_desc_tuple_b,
+            const InADataTypePointerTuple p_in_global_tuple_a,
+            const InBDataTypePointerTuple p_in_global_tuple_b,
+            const OutADataTypePointerTuple p_out_global_tuple_a,
+            const OutBDataTypePointerTuple p_out_global_tuple_b,
+            const Block2TileMapA block_2_tile_map_a,
+            const Block2TileMapB block_2_tile_map_b,
+            const ElementwiseOperation elementwise_op,
+            const index_t a_grid_size,
+            const index_t batch_count_a,
+            const index_t batch_count_b,
+            const std::array<index_t, NumInputsA> input_batch_strides_a,
+            const std::array<index_t, NumInputsB> input_batch_strides_b,
+            const std::array<index_t, NumOutputsA> output_batch_strides_a,
+            const std::array<index_t, NumOutputsB> output_batch_strides_b)
+{
+    static_assert(InAGridDescTuple::Size() == NumInputsA &&
+                  InADataTypePointerTuple::Size() == NumInputsA);
+    static_assert(OutAGridDescTuple::Size() == NumOutputsA &&
+                  OutADataTypePointerTuple::Size() == NumOutputsA);
+    static_assert(InBGridDescTuple::Size() == NumInputsB &&
+                  InBDataTypePointerTuple::Size() == NumInputsB);
+    static_assert(OutBGridDescTuple::Size() == NumOutputsB &&
+                  OutBDataTypePointerTuple::Size() == NumOutputsB);
+
+    const index_t block_id = __builtin_amdgcn_readfirstlane(get_block_1d_id());
+
+    if(block_id < a_grid_size)
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(a_grid_size / batch_count_a);
+        const index_t g_idx = __builtin_amdgcn_readfirstlane(block_id / num_blocks_per_batch);
+
+        InADataTypePointerTuple p_in_global_with_offset_tuple;
+        OutADataTypePointerTuple p_out_global_with_offset_tuple;
+
+        static_for<0, InADataTypePointerTuple::Size(), 1>{}([&](auto i) {
+            p_in_global_with_offset_tuple(i) =
+                p_in_global_tuple_a.At(i) +
+                type_convert<long_index_t>(input_batch_strides_a[i]) * g_idx;
+        });
+
+        static_for<0, OutADataTypePointerTuple::Size(), 1>{}([&](auto i) {
+            p_out_global_with_offset_tuple(i) =
+                p_out_global_tuple_a.At(i) +
+                type_convert<long_index_t>(output_batch_strides_a[i]) * g_idx;
+        });
+
+        GridwiseElementwiseFunctorA::Run(in_grid_desc_tuple_a,
+                                         out_grid_desc_tuple_a,
+                                         p_in_global_with_offset_tuple,
+                                         p_out_global_with_offset_tuple,
+                                         block_2_tile_map_a,
+                                         elementwise_op,
+                                         block_id);
+    }
+    else
+    {
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane((get_grid_size() - a_grid_size) / batch_count_b);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane((block_id - a_grid_size) / num_blocks_per_batch);
+
+        InBDataTypePointerTuple p_in_global_with_offset_tuple;
+        OutBDataTypePointerTuple p_out_global_with_offset_tuple;
+
+        static_for<0, InBDataTypePointerTuple::Size(), 1>{}([&](auto i) {
+            p_in_global_with_offset_tuple(i) =
+                p_in_global_tuple_b.At(i) +
+                type_convert<long_index_t>(input_batch_strides_b[i]) * g_idx;
+        });
+
+        static_for<0, OutBDataTypePointerTuple::Size(), 1>{}([&](auto i) {
+            p_out_global_with_offset_tuple(i) =
+                p_out_global_tuple_b.At(i) +
+                type_convert<long_index_t>(output_batch_strides_b[i]) * g_idx;
+        });
+
+        GridwiseElementwiseFunctorB::Run(in_grid_desc_tuple_b,
+                                         out_grid_desc_tuple_b,
+                                         p_in_global_with_offset_tuple,
+                                         p_out_global_with_offset_tuple,
+                                         block_2_tile_map_b,
+                                         elementwise_op,
+                                         block_id - a_grid_size);
     }
 }
 
@@ -126,12 +242,13 @@ __global__ void
     OutDataTypePointerTuple p_out_global_with_offset_tuple;
 
     static_for<0, InDataTypePointerTuple::Size(), 1>{}([&](auto i) {
-        p_in_global_with_offset_tuple(i) = p_in_global_tuple.At(i) + input_batch_strides[i] * g_idx;
+        p_in_global_with_offset_tuple(i) =
+            p_in_global_tuple.At(i) + type_convert<long_index_t>(input_batch_strides[i]) * g_idx;
     });
 
     static_for<0, OutDataTypePointerTuple::Size(), 1>{}([&](auto i) {
         p_out_global_with_offset_tuple(i) =
-            p_out_global_tuple.At(i) + output_batch_strides[i] * g_idx;
+            p_out_global_tuple.At(i) + type_convert<long_index_t>(output_batch_strides[i]) * g_idx;
     });
 
     GridwiseElementwiseFunctor::Run(in_grid_desc_tuple,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
index 8b3f51b9b0..ab97a940a8 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
@@ -500,8 +500,9 @@ struct GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
index 344656b13f..79ab3acd92 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -674,10 +674,22 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-        constexpr index_t KPack = math::max(
-            math::lcm(AK1, BK1),
-            MfmaSelector<AComputeDataType, MPerXdl, NPerXdl, BComputeDataType>::selected_mfma
-                .k_per_blk);
+        constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+        constexpr bool is_single_rate_mfma =
+            (((is_same<AComputeDataType, half_t>::value ||
+               is_same<AComputeDataType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<AComputeDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
+                ? true
+                : false;
+
+        constexpr index_t KPack =
+            math::max(lcm_AK1_BK1,
+                      MfmaSelector<AComputeDataType,
+                                   MPerXdl,
+                                   NPerXdl,
+                                   BComputeDataType,
+                                   is_single_rate_mfma>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 60ee78528d..0e51c6904c 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -466,8 +466,9 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index 060f6d5d15..d54a00eaa2 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -635,10 +635,22 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
-        constexpr index_t KPack = math::max(
-            math::lcm(AK1, BK1),
-            MfmaSelector<AComputeDataType, MPerXdl, NPerXdl, BComputeDataType>::selected_mfma
-                .k_per_blk);
+        constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+        constexpr bool is_single_rate_mfma =
+            (((is_same<AComputeDataType, half_t>::value ||
+               is_same<AComputeDataType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<AComputeDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
+                ? true
+                : false;
+
+        constexpr index_t KPack =
+            math::max(lcm_AK1_BK1,
+                      MfmaSelector<AComputeDataType,
+                                   MPerXdl,
+                                   NPerXdl,
+                                   BComputeDataType,
+                                   is_single_rate_mfma>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
index b4c5d004c4..57b9b02548 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp
@@ -600,10 +600,22 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         //     b_mtx[K0PerBlock, NPerBlock] is in LDS
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
-        constexpr index_t KPack = math::max(
-            math::lcm(AK1, BK1),
-            MfmaSelector<AComputeDataType, MPerXdl, NPerXdl, BComputeDataType>::selected_mfma
-                .k_per_blk);
+        constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+        constexpr bool is_single_rate_mfma =
+            (((is_same<AComputeDataType, half_t>::value ||
+               is_same<AComputeDataType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<AComputeDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
+                ? true
+                : false;
+
+        constexpr index_t KPack =
+            math::max(lcm_AK1_BK1,
+                      MfmaSelector<AComputeDataType,
+                                   MPerXdl,
+                                   NPerXdl,
+                                   BComputeDataType,
+                                   is_single_rate_mfma>::selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
index d1d97da5b0..88d6be234c 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
@@ -601,8 +601,9 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<AComputeType, half_t>::value || is_same<AComputeType, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<AComputeType, half_t>::value || is_same<AComputeType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<AComputeType, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index 7105fa7012..56581256dc 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -453,8 +453,9 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
index 3429c20e73..23b4aec3b0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp
@@ -583,8 +583,9 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<ABDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack =
@@ -1015,8 +1016,9 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<ABDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp
index d7c87a170c..44c1e936bd 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp
@@ -597,8 +597,9 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<ComputeType, half_t>::value || is_same<ComputeType, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<ComputeType, half_t>::value || is_same<ComputeType, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<ComputeType, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp
new file mode 100644
index 0000000000..4dfa472103
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp
@@ -0,0 +1,1725 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/env.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_gemm_wmma_cshuffle_v3(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+#if defined(__gfx11__)
+    // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
+    using c_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_c_grid)>>;
+    if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
+                   (std::is_same_v<c_data_type, ck::half_t> ||
+                    std::is_same_v<c_data_type, ck::bhalf_t>)))
+    {
+#endif
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared,
+            karg);
+#if defined(__gfx11__)
+    }
+#endif
+#else
+    ignore = karg;
+#endif
+}
+
+/// @brief \"Universal\" GEMM kernel with SplitK support.
+///
+/// @par Overview
+///         This GEMM kernel is carrying out following mathematical equation:
+///         C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N}))
+///         Where A, B are input tensors and C is the output tensor. The A/B/C_op are
+///         elementwise operations that could be applied on each tensor respectively.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through it's design
+///         and versatilty.
+///
+/// @note   This Kernel implementation supports SplitK algorithm. It can be configured
+///         to split the dot product accumulated over the K dimension into multiple working groups.
+///         The partial products of different workgroups are then reduced using the AtomicAdd
+///         operation.
+///
+/// @tparam ALayout     A tensor data layout.
+/// @tparam BLayout     B tensor data layout.
+/// @tparam CLayout     C tensor data layout.
+/// @tparam ADataType   A tensor data type.
+/// @tparam BDataType   B tensor data type.
+/// @tparam AccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam CDataType   C tensor data type.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor
+///                               (after GEMM).
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1Value    The vector load size from global memory for A tensor.
+/// @tparam BK1Value    The vector load size from global memory for B tensor.
+/// @tparam MPerWmma    M size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam NPerWmma    N size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam MRepeat     The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NRepeat     The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam AThreadTransferSrcResetCoordinateAfterRun   Decides whether we reset thread coordinate
+///                          (return back to the window origin) after all thread finish data copy.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BThreadTransferSrcResetCoordinateAfterRun   Decides whether we reset thread coordinate
+///                          (return back to the window origin) after all thread finish data copy.
+/// @tparam BBlockLdsExtraN             Whether to use padding for LDS or not. With universal GEMM
+///                                         there's no need for padding.
+/// @tparam CShuffleMRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access.
+///                                         Used when storing data to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled).
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v4,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct GridwiseGemm_wmma_cshuffle_v3
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+
+    // K1 should be Number<...>
+    static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number = Number<AK1Value>{};
+    static constexpr auto BK1Number = Number<BK1Value>{};
+
+    static constexpr index_t KPack = math::max(
+        math::lcm(AK1Number, BK1Number),
+        WmmaSelector<ComputeTypeA, ComputeTypeB, AccDataType, MPerWmma, NPerWmma>::selected_wmma
+            .k_per_wmma);
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_least_multiple(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_least_multiple(N, NPerBlock);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K)
+    {
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / AK1Value);
+    }
+
+    __host__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / BK1Value);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * KPerBlock;
+    }
+
+    __host__ static auto CalculateKRead(index_t K, index_t K_Batch = 1)
+    {
+        constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+        auto K_t                = K_Batch * KReadVec;
+        return (K + K_t - 1) / K_t * KReadVec;
+    }
+
+    __host__ static auto CalculateMBlock(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNBlock(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock);
+    }
+
+    template <index_t MNRepeat, index_t MNWaves, index_t MNPerWmma, typename BlockDesc>
+    __host__ __device__ static constexpr auto MakeWmmaTileDescriptor(const BlockDesc&)
+    {
+        // K0_N_K1 -> K0_MNRepeat_MNWaves_MNPerWmma_K1
+        constexpr auto K0 = BlockDesc{}.GetLength(I0);
+        constexpr auto K1 = BlockDesc{}.GetLength(I2);
+#ifdef __gfx12__
+        constexpr auto KRow = I2;
+#else
+        constexpr auto KRow = I1;
+#endif
+        return transform_tensor_descriptor(
+            BlockDesc{},
+            make_tuple(make_unmerge_transform(make_tuple(Number<K0 / KRow>{}, KRow)),
+                       make_unmerge_transform(
+                           make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                       make_pass_through_transform(Number<K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            static_assert(!PermuteA, "PermuteA is not supported");
+
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            if constexpr(!PermuteB)
+            {
+                // not pad N or K
+                const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                    b_grid_desc_nraw_kraw,
+                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                               make_pass_through_transform(N)),
+                    make_tuple(Sequence<1>{}, Sequence<0>{}),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+                return b_grid_desc_bk0_n_bk1;
+            }
+            else
+            {
+                // Pre-shuffled Weight
+                // BGlobal[K / KPerBlock, N, KPerBlock / K1, K1] -> BTile[K / K1, N, K1]
+                constexpr index_t BK01 = KPerBlock / BK1Value;
+                const index_t BK0_     = StrideB / BK1Value;
+                const index_t BK00     = BK0_ / BK01;
+
+                const auto b_grid_desc_bk00_n_bk01_bk1_permute =
+                    make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value));
+
+                const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor(
+                    b_grid_desc_bk00_n_bk01_bk1_permute,
+                    make_tuple(make_merge_transform(make_tuple(BK00, BK01)),
+                               make_pass_through_transform(make_tuple(N)),
+                               make_pass_through_transform(BK1Value)),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+                return b_grid_desc_bk0_n_bk1_permute;
+            }
+        }
+    }
+
+    template <typename ABlockDesc_AK0_M_AK1>
+    __host__ __device__ static constexpr auto MakeAWmmaTileDescriptor(const ABlockDesc_AK0_M_AK1&)
+    {
+        constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma);
+
+        return MakeWmmaTileDescriptor<MRepeat, MWaves, MPerWmma>(ABlockDesc_AK0_M_AK1{});
+    }
+
+    template <typename BBlockDesc_BK0_N_BK1>
+    __host__ __device__ static constexpr auto MakeBWmmaTileDescriptor(const BBlockDesc_BK0_N_BK1&)
+    {
+        constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma);
+
+        return MakeWmmaTileDescriptor<NRepeat, NWaves, NPerWmma>(BBlockDesc_BK0_N_BK1{});
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+        // TODO: Investigate why this path is not used in the original
+        // gridwise_gemm_xdl_cshuffle_v3.hpp
+#if 0
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+#endif
+    }
+
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_,
+                         index_t KBatch_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              KBatch{KBatch_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KRead{CalculateKRead(K_, KBatch_)},
+              KPadded{CalculateKPadded(K_, KBatch_)},
+              AK0{CalculateAK0Padded(K_, KBatch_)},
+              BK0{CalculateBK0Padded(K_, KBatch_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KRead:" << KRead << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t KBatch;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KRead;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+    };
+
+    // Argument
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const ADataType* p_a_grid_,
+                          const BDataType* p_b_grid_,
+                          CDataType* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_,
+                          index_t k_batch_,
+                          bool is_reduce_ = false)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, k_batch_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_},
+              is_reduce(is_reduce_)
+        {
+        }
+
+        __host__ __device__ inline bool IsReduceAdd() const
+        {
+            return (Problem::KBatch > 1) && is_reduce;
+        }
+
+        __host__ __device__ inline bool IsAtomicAdd() const
+        {
+            return (Problem::KBatch > 1) && (!is_reduce);
+        }
+
+        const ADataType* p_a_grid;
+        const BDataType* p_b_grid;
+        CDataType* p_c_grid;
+        bool is_reduce;
+    };
+
+    struct SplitKBatchOffset
+    {
+
+        __device__ SplitKBatchOffset(Argument& karg)
+        {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+            }
+
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                if constexpr(!PermuteB)
+                {
+                    b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
+                }
+                else
+                {
+                    const int k0_offset = karg.KRead * karg.N;
+                    b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
+                }
+            }
+
+            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            {
+                karg.K = karg.KRead;
+            }
+            else
+            {
+                karg.K = karg.K - karg.KRead * (karg.KBatch - 1);
+            }
+
+            if(karg.IsReduceAdd())
+            {
+                c_reduce_offset = blockIdx.z * karg.M * karg.N;
+            }
+            else
+            {
+                c_reduce_offset = 0;
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t c_reduce_offset;
+    };
+
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+        {
+            // bank conflict when writting the data into LDS, but don't worry, we have whole entire
+            // loop to hide it in v4. it may give you some benefit from less valu in compute address
+            return make_naive_tensor_descriptor(
+                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                make_tuple(Number<MPerBlock>{} * AK1Number, AK1Number, I1));
+        }
+        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
+        // in some cases.
+        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize;
+            constexpr auto MLdsLayer        = LdsSize < 1 ? 1 : LdsSize;
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
+                make_tuple(
+                    AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock * MLdsLayer>{}, I1));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(make_tuple(
+                               Number<MPerBlock / MLdsLayer>{}, Number<AK0Number * MLdsLayer>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number<MLdsLayer>{})),
+                           make_pass_through_transform(Number<MPerBlock / MLdsLayer>{}),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_ak0_mldslayer_m_ak1,
+                make_tuple(make_pass_through_transform(AK0Number),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<MPerBlock / MLdsLayer>{}, Number<MLdsLayer>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+        else // ColumnMajor A
+        {
+            // kfold and mpair dimension is not always required.
+            // more dimension in merge_transform increase the difficulty of generating immarg offset
+            // for compiler.
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
+
+            constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / MPerWmma;
+            constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
+
+            constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : 128 / (AK1Number * M0 * sizeof(ADataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=mpair<=n0
+            constexpr auto mpair = (AK1Number * MPerWmma * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : ((128 / (AK1Number * MPerWmma * sizeof(ADataType))) > M0
+                                              ? M0
+                                              : 128 / (AK1Number * MPerWmma * sizeof(ADataType)));
+
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * M1>{},
+                           Number<kfold * M0 / mpair>{},
+                           Number<mpair>{},
+                           AK1Number));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * M1>{}, Number<kfold * M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<M1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<M0 / mpair>{}, Number<mpair>{}, Number<M1>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // B matrix in LDS memory, dst of blockwise copy
+        if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+        {
+            // bank conflict when writting the data into LDS, but don't worry, we have whole entire
+            // loop to hide it in v4. it may give you some benefit from less valu in compute address
+            return make_naive_tensor_descriptor(
+                make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
+                make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1Number, BK1Number, I1));
+        }
+        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+        {
+            // NLdsLayer * K0 as logical Bank
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize;
+            constexpr index_t NLdsLayer     = LdsSize < 1 ? 1 : LdsSize;
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
+                make_tuple(
+                    BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
+                make_tuple(BK1Number, Number<KPerBlock * NLdsLayer>{}, I1));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(make_tuple(
+                               Number<NPerBlock / NLdsLayer>{}, Number<BK0Number * NLdsLayer>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number<NLdsLayer>{})),
+                           make_pass_through_transform(Number<NPerBlock / NLdsLayer>{}),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
+
+            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_bk0_nldslayer_n_bk1,
+                make_tuple(make_pass_through_transform(BK0Number),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<NPerBlock / NLdsLayer>{}, Number<NLdsLayer>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return b_lds_block_desc_bk0_n_bk1;
+        }
+        else // RowMajor B
+        {
+            constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N1 = NPerBlock / N0;
+
+            constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / NPerWmma;
+            constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
+
+            constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : 128 / (BK1Number * N0 * sizeof(BDataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=npair<=n0
+            constexpr auto npair = (BK1Number * NPerWmma * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : ((128 / (BK1Number * NPerWmma * sizeof(BDataType))) > N0
+                                              ? N0
+                                              : 128 / (BK1Number * NPerWmma * sizeof(BDataType)));
+
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * N1>{},
+                           Number<kfold * N0 / npair>{},
+                           Number<npair>{},
+                           BK1Number));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * N1>{}, Number<kfold * N0 / npair>{})),
+                    make_pass_through_transform(Number<npair>{}),
+                    make_pass_through_transform(BK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<N1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<N0 / npair>{})),
+                    make_pass_through_transform(Number<npair>{}),
+                    make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<N0 / npair>{}, Number<npair>{}, Number<N1>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return b_lds_block_desc_bk0_n_bk1;
+        }
+    }
+
+    __host__ __device__ static constexpr auto
+    // *Caution Here repeat is shuffle repeat
+    GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
+    {
+        constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma);
+        constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma);
+
+        constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMRepeatPerShuffle * MWaves * MPerWmma>{},
+                           I1,
+                           Number<CShuffleNRepeatPerShuffle * NWaves * NPerWmma>{}));
+
+        return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat;
+    }
+
+    using BlockwiseGemmPipe = remove_cvref_t<
+        decltype(BlockGemmPipeline_Selector<
+                 BlkGemmPipelineVer,
+                 BlkGemmPipeSched,
+                 BlockSize,
+                 ADataType,
+                 BDataType,
+                 ComputeTypeA,
+                 ComputeTypeB,
+                 AccDataType,
+                 decltype(MakeAWmmaTileDescriptor(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())),
+                 decltype(MakeBWmmaTileDescriptor(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())),
+                 ABlockTransferSrcScalarPerVector,
+                 BBlockTransferSrcScalarPerVector,
+                 MPerBlock,
+                 NPerBlock,
+                 KPerBlock,
+                 MPerWmma,
+                 NPerWmma,
+                 MRepeat,
+                 NRepeat,
+                 KPack>())>;
+
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+            GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat();
+
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat
+                .GetElementSpaceSize();
+
+        return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize +
+                          b_block_space_size_aligned * sizeof(BDataType) / BPackedSize),
+                         c_block_size * sizeof(CShuffleDataType));
+    }
+
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    __host__ static constexpr bool CheckValidity(const Argument& karg)
+    {
+        static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) &&
+                          (NPerBlock % (NPerWmma * NRepeat)) == 0,
+                      "Invalid tuning param!");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
+        {
+            if(!(karg.M % MPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
+        {
+            if(!(karg.N % NPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+
+            auto K_t = karg.KBatch * KPerBlock;
+            if(!(karg.K % K_t == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
+                              << karg.K << " " << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+            auto K_t                = karg.KBatch * KReadVec;
+            auto KReadPadSplited    = math::integer_divide_ceil(karg.K, K_t) * KReadVec;
+            if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(is_same<remove_cvref_t<CDataType>, half_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, float>::value ||
+                       is_same<remove_cvref_t<CDataType>, bhalf_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, int32_t>::value))
+        {
+            if(!karg.IsReduceAdd())
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << " KBatch: " << karg.KBatch << " > 1 is not supported yet"
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                if(karg.KBatch > 1)
+                {
+                    return false;
+                }
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value);
+
+        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        {
+            if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
+            {
+                return false;
+            }
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
+    }
+
+    __host__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
+    }
+
+    template <typename CGridDesc>
+    __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    // if arch = gfx942
+    using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
+    // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename BGridDesc_BK0_N_K1,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem,
+                               const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                               const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
+                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                BElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
+                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BDataType,
+                                                BDataType,
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
+                                                BBlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                BBlockTransferSrcVectorDim,
+                                                2,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_BK1,
+                                                1,
+                                                1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                b_grid_desc_bk0_n_bk1,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_element_op,
+                b_block_desc_bk0_n_bk1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        // Cast after lds
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) + a_block_space_size_aligned *
+                                                                            sizeof(ADataType) /
+                                                                            APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
+                                                                         a_block_desc_ak0_m_ak1,
+                                                                         a_blockwise_copy,
+                                                                         a_grid_buf,
+                                                                         a_block_buf,
+                                                                         a_block_slice_copy_step,
+                                                                         b_grid_desc_bk0_n_bk1,
+                                                                         b_block_desc_bk0_n_bk1,
+                                                                         b_blockwise_copy,
+                                                                         b_grid_buf,
+                                                                         b_block_buf,
+                                                                         b_block_slice_copy_step,
+                                                                         c_thread_buf,
+                                                                         num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            // C mapping in single thread.
+            constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs =
+                blockwise_gemm_pipeline
+                    .GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs();
+
+            // C mapping in single block
+            constexpr auto
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp =
+                    blockwise_gemm_pipeline
+                        .GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs();
+
+            constexpr auto MWave =
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
+                    .GetLength(I1);
+            constexpr auto MSubGroup =
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
+                    .GetLength(I2);
+            constexpr auto NWave =
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
+                    .GetLength(I4);
+            constexpr auto NThreadPerSubGroup =
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
+                    .GetLength(I5);
+            constexpr auto MAccVgprs =
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
+                    .GetLength(I6);
+
+            // LDS descriptor, shuffle and write out in MRepeat x NRepeat times
+            constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+                GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat
+                    .GetElementSpaceSize());
+
+            constexpr auto
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs =
+                    transform_tensor_descriptor(
+                        c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
+                        make_tuple(
+                            make_freeze_transform(I0),
+                            make_unmerge_transform(make_tuple(
+                                Number<CShuffleMRepeatPerShuffle>{}, // MRepeat per shuffle repeat
+                                MWave,                               // MWave
+                                MSubGroup, // MSubGroup * MAccVgprs = MPerWmma
+                                MAccVgprs)),
+                            make_freeze_transform(I0),
+                            make_unmerge_transform(make_tuple(
+                                Number<CShuffleNRepeatPerShuffle>{}, // NRepeat per shuffle repeat
+                                NWave,                               // NWave
+                                NThreadPerSubGroup))), // NThreadPerSubGroup = NPerWmma
+                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                        make_tuple(Sequence<>{},
+                                   Sequence<0, 1, 2, 6>{},
+                                   Sequence<>{},
+                                   Sequence<3, 4, 5>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor =
+                make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(
+                                                     MRepeat, MWave, MSubGroup, MAccVgprs))),
+                                                 make_tuple(Sequence<0, 1, 2, 3>{}),
+                                                 make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor
+                    .CalculateBottomIndex(make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor =
+                make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(
+                                                     NRepeat, NWave, NThreadPerSubGroup))),
+                                                 make_tuple(Sequence<0, 1, 2>{}),
+                                                 make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor
+                    .CalculateBottomIndex(make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3<
+                AccDataType,
+                CShuffleDataType,
+                decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs),
+                decltype(c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs),
+                ck::tensor_operation::element_wise::PassThrough,
+                Sequence<CShuffleMRepeatPerShuffle,
+                         I1,
+                         I1,
+                         CShuffleNRepeatPerShuffle,
+                         I1,
+                         I1,
+                         MAccVgprs>,
+                Sequence<0, 1, 2, 3, 4, 5, 6>,
+                6,
+                1, // vector write pixel
+                InMemoryDataOperationEnum::Set,
+                1,
+                true>{
+                c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
+                make_multi_index(0,
+                                 m_thread_data_on_block_idx[I1],
+                                 m_thread_data_on_block_idx[I2],
+                                 0,
+                                 n_thread_data_on_block_idx[I1],
+                                 n_thread_data_on_block_idx[I2],
+                                 m_thread_data_on_block_idx[I3]),
+                ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMRepeatPerShuffle * MWave * MPerWmma,
+                         1,
+                         CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0),
+                 c_element_op};
+
+            // space filling curve for local reg & global memory
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MRepeat, 1, 1, NRepeat, 1, 1, MAccVgprs>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                  Sequence<CShuffleMRepeatPerShuffle,
+                                           1,
+                                           1,
+                                           CShuffleNRepeatPerShuffle,
+                                           1,
+                                           1,
+                                           MAccVgprs>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMRepeatPerShuffle * MWave * MPerWmma,
+                                           1,
+                                           CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(
+                    c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
+                    sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                    c_thread_buf,
+                    c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
+                    c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        Run<decltype(a_grid_desc_ak0_m_ak1),
+            decltype(b_grid_desc_bk0_n_bk1),
+            decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+            HasMainKBlockLoop,
+            CGlobalMemoryDataOperation,
+            TailNum>(p_a_grid,
+                     p_b_grid,
+                     p_c_grid,
+                     p_shared,
+                     problem,
+                     a_grid_desc_ak0_m_ak1,
+                     b_grid_desc_bk0_n_bk1,
+                     c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp
similarity index 99%
rename from include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp
rename to include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp
index 08d9386d72..d37b3cd38e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -81,8 +81,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
     static constexpr bool is_single_rate_mfma =
-        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
-         lcm_AK1_BK1 <= 4)
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
             ? true
             : false;
     static constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
index fcb12f4a14..e5e32a8535 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -141,8 +142,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
     static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
     static constexpr bool is_single_rate_mfma =
-        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
-         lcm_AK1_BK1 <= 4)
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
             ? true
             : false;
     static constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index c293d64ef0..240bc464e1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -810,9 +810,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
+        constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+        constexpr bool is_single_rate_mfma =
+            (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
+                ? true
+                : false;
         constexpr index_t KPack = math::max(
-            math::lcm(AK1Number, BK1Number),
-            MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk);
+            lcm_AK1_BK1,
+            MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB, is_single_rate_mfma>::
+                selected_mfma.k_per_blk);
 
         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
             BlockSize,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp
index af91721c8a..c7d44e842d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp
@@ -871,8 +871,9 @@ struct GridwiseGemm_xdl_cshuffle_v2
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
         constexpr bool is_single_rate_mfma =
-            ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
old mode 100755
new mode 100644
index d4c915aa5e..29150c0688
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -82,6 +83,109 @@ __global__ void
 #endif // end of if (defined(__gfx9__))
 }
 
+/// @brief \"Universal\" GEMM kernel with SplitK support.
+///
+/// @par Overview
+///         This GEMM kernel is carrying out following mathematical equation:
+///         C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N}))
+///         Where A, B are input tensors and C is the output tensor. The A/B/C_op are
+///         elementwise operations that could be applied on each tensor respectively.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through it's design
+///         and versatilty.
+///
+/// @note   This Kernel implementation supports SplitK algorithm. It can be configured
+///         to split the dot product accumulated over the K dimension into multiple working groups.
+///         The partial products of different workgroups are then reduced using the AtomicAdd
+///         operation.
+///
+/// @tparam ALayout     A tensor data layout.
+/// @tparam BLayout     B tensor data layout.
+/// @tparam CLayout     C tensor data layout.
+/// @tparam ADataType   A tensor data type.
+/// @tparam BDataType   B tensor data type.
+/// @tparam AccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam CDataType   C tensor data type.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor
+///                               (after GEMM).
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1Value    The vector load size from global memory for A tensor.
+/// @tparam BK1Value    The vector load size from global memory for B tensor.
+/// @tparam MPerXdl     M size of matrix-fused-multiply-add instruction.
+/// @tparam NPerXdl     N size of matrix-fused-multiply-add instruction.
+/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam AThreadTransferSrcResetCoordinateAfterRun   Decides whether we reset thread coordinate
+///                          (return back to the window origin) after all thread finish data copy.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BThreadTransferSrcResetCoordinateAfterRun   Decides whether we reset thread coordinate
+///                          (return back to the window origin) after all thread finish data copy.
+/// @tparam BBlockLdsExtraN                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam CShuffleMXdlPerWavePerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNXdlPerWavePerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access.
+///                                         Used when storing data to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled).
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -149,8 +253,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
     static constexpr bool is_single_rate_mfma =
-        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
-         lcm_AK1_BK1 <= 4)
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
             ? true
             : false;
     static constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
new file mode 100644
index 0000000000..a22fc06a50
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -0,0 +1,1874 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
+// kernel function Blockers:
+// 1. Two separted declaration of __shared__ pointer is the key to make sure data access operate on
+// two lds chunks.
+// 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
+// buffer when we declare __shared__ inside blkgemmpipe
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Even>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_gemm_xdl_cshuffle_v3_b_preshuffle(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        p_shared,
+        karg);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Even>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+
+    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        p_shared_0,
+        p_shared_1,
+        karg);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+
+    // K1 should be Number<...>
+    static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number = Number<AK1Value>{};
+    static constexpr auto BK1Number = Number<BK1Value>{};
+
+    using mfma_selector = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>;
+
+    static constexpr index_t KPack =
+        math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk);
+
+    static constexpr index_t KLane =
+        mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops();
+    static constexpr index_t KRepeat = KPerBlock / KLane / KPack;
+    static constexpr index_t NLane   = NPerXdl;
+    static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_least_multiple(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_least_multiple(N, NPerBlock);
+    }
+
+    __host__ __device__ static auto CalculateBN0Shuffled(index_t N)
+    {
+        return math::integer_divide_ceil(N, NLane);
+    }
+
+    __host__ __device__ static auto CalculateBK0Shuffled(index_t K)
+    {
+        return math::integer_divide_ceil(K, KLane * KPack);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K)
+    {
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / AK1Value);
+    }
+
+    __host__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / BK1Value);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * KPerBlock;
+    }
+
+    __host__ static auto CalculateKRead(index_t K, index_t K_Batch = 1)
+    {
+        constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+        auto K_t                = K_Batch * KReadVec;
+        return (K + K_t - 1) / K_t * KReadVec;
+    }
+
+    __host__ static auto CalculateMBlock(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNBlock(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock);
+    }
+
+    template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
+    __host__ __device__ static constexpr auto MakeGemmMmaTileDescriptor(const TileDesc_K0_MN_K1&)
+    {
+        constexpr index_t K0 = TileDesc_K0_MN_K1{}.GetLength(Number<0>{});
+        constexpr index_t K1 = TileDesc_K0_MN_K1{}.GetLength(Number<2>{});
+
+        return transform_tensor_descriptor(
+            TileDesc_K0_MN_K1{},
+            make_tuple(make_merge_transform_v3_division_mod(make_tuple(Number<K0>{}, Number<K1>{})),
+                       make_unmerge_transform(make_tuple(
+                           Number<MNXdlPerWave>{}, Number<MNWaves>{}, Number<MNPerXdl>{}))),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+            make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
+    {
+        constexpr index_t NkSwizzleNumber = Number<warpSize * KPack>{};
+        return make_naive_tensor_descriptor(
+            make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
+            make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            if constexpr(!PermuteB)
+            {
+                // not pad N or K
+                const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                    b_grid_desc_nraw_kraw,
+                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                               make_pass_through_transform(N)),
+                    make_tuple(Sequence<1>{}, Sequence<0>{}),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+                return b_grid_desc_bk0_n_bk1;
+            }
+            else
+            {
+                // Pre-shuffled Weight
+                // BGlobal[K / KPerBlock, N, KPerBlock / K1, K1] -> BTile[K / K1, N, K1]
+                constexpr index_t BK01 = KPerBlock / BK1Value;
+                const index_t BK0_     = StrideB / BK1Value;
+                const index_t BK00     = BK0_ / BK01;
+
+                const auto b_grid_desc_bk00_n_bk01_bk1_permute =
+                    make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value));
+
+                const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor(
+                    b_grid_desc_bk00_n_bk01_bk1_permute,
+                    make_tuple(make_merge_transform(make_tuple(BK00, BK01)),
+                               make_pass_through_transform(make_tuple(N)),
+                               make_pass_through_transform(BK1Value)),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+                return b_grid_desc_bk0_n_bk1_permute;
+            }
+        }
+    }
+
+    template <typename ABlockDesc_AK0_M_AK1>
+    __host__ __device__ static constexpr auto
+    MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&)
+    {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+
+        return MakeGemmMmaTileDescriptor<MXdlPerWave, MWaves, MPerXdl>(ABlockDesc_AK0_M_AK1{});
+    }
+
+    template <typename BBlockDesc_BK0_N_BK1>
+    __host__ __device__ static constexpr auto
+    MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
+    {
+        // constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        // return MakeGemmMmaTileDescriptor<NXdlPerWave, NWaves, NPerXdl>(BBlockDesc_BK0_N_BK1{});
+
+        return MakeGemmMmaTileDescriptor<NXdlPerWave, NWave, NPerXdl>(BBlockDesc_BK0_N_BK1{});
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+#if 0
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+#endif
+    }
+
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideB_,
+                         index_t StrideC_,
+                         index_t KBatch_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              KBatch{KBatch_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KRead{CalculateKRead(K_, KBatch_)},
+              KPadded{CalculateKPadded(K_, KBatch_)},
+              AK0{CalculateAK0Padded(K_, KBatch_)},
+              BK0{CalculateBK0Padded(K_, KBatch_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)},
+              BN0Shuffled{CalculateBN0Shuffled(N_)},
+              BK0Shuffled{CalculateBK0Shuffled(K_)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KRead:" << KRead << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t KBatch;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KRead;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+        // For B pre-shuffle only
+        index_t BN0Shuffled;
+        index_t BK0Shuffled;
+    };
+
+    // Argument
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const ADataType* p_a_grid_,
+                          const BDataType* p_b_grid_,
+                          CDataType* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_,
+                          index_t k_batch_,
+                          bool is_reduce_ = false)
+            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, k_batch_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_},
+              is_reduce(is_reduce_)
+        {
+        }
+
+        __host__ __device__ inline bool IsReduceAdd() const
+        {
+            return (Problem::KBatch > 1) && is_reduce;
+        }
+
+        __host__ __device__ inline bool IsAtomicAdd() const
+        {
+            return (Problem::KBatch > 1) && (!is_reduce);
+        }
+
+        const ADataType* p_a_grid;
+        const BDataType* p_b_grid;
+        CDataType* p_c_grid;
+        bool is_reduce;
+    };
+
+    struct SplitKBatchOffset
+    {
+
+        __device__ SplitKBatchOffset(Argument& karg)
+        {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = blockIdx.z * karg.KRead / APackedSize;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+            }
+
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                if constexpr(!PermuteB)
+                {
+                    // b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
+
+                    b_k_split_offset = blockIdx.z * karg.KRead * NLane / BPackedSize;
+                }
+                else
+                {
+                    const int k0_offset = karg.KRead * karg.N;
+                    b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
+                }
+            }
+
+            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            {
+                karg.K = karg.KRead;
+            }
+            else
+            {
+                karg.K = karg.K - karg.KRead * (karg.KBatch - 1);
+            }
+
+            if(karg.IsReduceAdd())
+            {
+                c_reduce_offset = blockIdx.z * karg.M * karg.N;
+            }
+            else
+            {
+                c_reduce_offset = 0;
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t c_reduce_offset;
+    };
+
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        if constexpr(ABlockLdsExtraM)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock + ABlockLdsExtraM>{}, I1));
+        }
+        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
+        // in some cases.
+        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            constexpr auto a_lds_block_desc =
+                make_naive_tensor_descriptor(make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                                             make_tuple(AK1Number, Number<KPerBlock>{}, I1));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(
+                               make_tuple(Number<MPerBlock>{}, Number<AK0Number>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_permuted;
+        }
+        else // ColumnMajor A
+        {
+            // kfold and mpair dimension is not always required.
+            // more dimension in merge_transform increase the difficulty of generating immarg offset
+            // for compiler.
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
+
+            constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
+
+            constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : 128 / (AK1Number * M0 * sizeof(ADataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=mpair<=n0
+            constexpr auto mpair = (AK1Number * MPerXdl * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : ((128 / (AK1Number * MPerXdl * sizeof(ADataType))) > M0
+                                              ? M0
+                                              : 128 / (AK1Number * MPerXdl * sizeof(ADataType)));
+
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * M1>{},
+                           Number<kfold * M0 / mpair>{},
+                           Number<mpair>{},
+                           AK1Number));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * M1>{}, Number<kfold * M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<M1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<M0 / mpair>{}, Number<mpair>{}, Number<M1>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // K0 -> N0/NWave -> NWave -> KLane -> NLane -> KPack
+        return make_naive_tensor_descriptor_packed(
+            make_tuple(Number<NXdlPerWave>{},
+                       I1,
+                       Number<KRepeat>{},
+                       Number<BK1Value>{})); //??? BK1Value same as KPack?
+    }
+
+    __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                           I1,
+                           Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+
+        return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    using BlockwiseGemmPipe =
+        remove_cvref_t<decltype(BlockGemmBPreshufflePipeline_Selector<
+                                BlkGemmPipelineVer,
+                                BlkGemmPipeSched,
+                                BlockSize,
+                                ADataType,
+                                BDataType,
+                                ComputeTypeA,
+                                AccDataType,
+                                decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()),
+                                decltype(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()),
+                                decltype(MakeAMmaTileDescriptor_M0_M1_M2_K(
+                                    GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())),
+                                decltype(MakeBMmaTileDescriptor_N0_N1_N2_K(
+                                    GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())),
+                                ABlockTransferSrcScalarPerVector,
+                                BBlockTransferSrcScalarPerVector,
+                                MPerBlock,
+                                NPerBlock,
+                                KPerBlock,
+                                MPerXdl,
+                                NPerXdl,
+                                MXdlPerWave,
+                                NXdlPerWave,
+                                KPack>())>;
+
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
+
+        return math::max(a_block_space_size_aligned * sizeof(ADataType) / APackedSize,
+                         c_block_size * sizeof(CShuffleDataType));
+    }
+
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    __host__ static constexpr bool CheckValidity(const Argument& karg)
+    {
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
+        {
+            if(!(karg.M % MPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
+        {
+            if(!(karg.N % NPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+
+            auto K_t = karg.KBatch * KPerBlock;
+            if(!(karg.K % K_t == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
+                              << karg.K << " " << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+            auto K_t                = karg.KBatch * KReadVec;
+            auto KReadPadSplited    = math::integer_divide_ceil(karg.K, K_t) * KReadVec;
+            if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(is_same<remove_cvref_t<CDataType>, half_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, float>::value ||
+                       is_same<remove_cvref_t<CDataType>, bhalf_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, int32_t>::value))
+        {
+            if(!karg.IsReduceAdd())
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__
+                              << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                if(karg.KBatch > 1)
+                {
+                    return false;
+                }
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value);
+
+        if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
+        {
+            return false;
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
+    }
+
+    __host__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
+    }
+
+    template <typename CGridDesc>
+    __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    // if arch = gfx942
+    using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
+    // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename BGridDesc_BPreshuffled,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem,
+                               const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                               const BGridDesc_BPreshuffled& b_grid_desc_bpreshuffled,
+                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bpreshuffled.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        // const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        // N0, K0, Blocksize*KPack
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix threadwise copy, using threadwiseTensorSliceTransfer_v2
+        auto b_block_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        auto b_blockwise_copy = ThreadwiseTensorSliceTransfer_v2<
+            BDataType,
+            BDataType,
+            decltype(b_grid_desc_bpreshuffled),
+            decltype(b_block_desc_bk0_n_bk1),
+            Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+            Sequence<1, 2, 0, 3>,
+            3,
+            BBlockTransferSrcScalarPerVector,
+            BThreadTransferSrcResetCoordinateAfterRun,
+            true>(b_grid_desc_bpreshuffled,
+                  make_multi_index(n_block_data_idx_on_grid,
+                                   get_warp_local_1d_id() % NWave,
+                                   0,
+                                   KPack * (get_thread_local_1d_id() % warpSize)));
+
+        // LDS allocation for A and B: be careful of alignment
+
+        // Cast after lds
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, KRepeat, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
+                                                                         a_block_desc_ak0_m_ak1,
+                                                                         a_blockwise_copy,
+                                                                         a_grid_buf,
+                                                                         a_block_buf,
+                                                                         a_block_slice_copy_step,
+                                                                         b_grid_desc_bpreshuffled,
+                                                                         b_blockwise_copy,
+                                                                         b_grid_buf,
+                                                                         b_block_buf,
+                                                                         b_block_slice_copy_step,
+                                                                         c_thread_buf,
+                                                                         num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 c_element_op};
+
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bpreshuffled =
+            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        Run<decltype(a_grid_desc_ak0_m_ak1),
+            decltype(b_grid_desc_bpreshuffled),
+            decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+            HasMainKBlockLoop,
+            CGlobalMemoryDataOperation,
+            TailNum>(p_a_grid,
+                     p_b_grid,
+                     p_c_grid,
+                     p_shared,
+                     problem,
+                     a_grid_desc_ak0_m_ak1,
+                     b_grid_desc_bpreshuffled,
+                     c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename BGridDesc_BPreshuffled,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const ADataType* p_a_grid,
+                                    const BDataType* p_b_grid,
+                                    CDataType* p_c_grid,
+                                    void* p_shared_0,
+                                    void* p_shared_1,
+                                    const Problem& problem,
+                                    const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                                    const BGridDesc_BPreshuffled& b_grid_desc_bpreshuffled,
+                                    const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bpreshuffled.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        // const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        // N0, K0, Blocksize*KPack
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                2>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix blockwise copy
+        // Thread-wise copy
+        // K0 -> N0/NWave -> NWave -> KLane -> NLane -> KPack
+        auto b_block_buf_ping = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto b_block_buf_pong = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
+
+        auto b_blockwise_copy = ThreadwiseTensorSliceTransfer_v2<
+            BDataType,
+            BDataType,
+            decltype(b_grid_desc_bpreshuffled),
+            decltype(b_block_desc_bk0_n_bk1),
+            Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+            Sequence<1, 2, 0, 3>,
+            3,
+            BBlockTransferSrcScalarPerVector,
+            BThreadTransferSrcResetCoordinateAfterRun,
+            true>(b_grid_desc_bpreshuffled,
+                  make_multi_index(n_block_data_idx_on_grid,
+                                   get_warp_local_1d_id() % NWave,
+                                   0,
+                                   KPack * (get_thread_local_1d_id() % warpSize)));
+
+        // LDS allocation for A and B: be careful of alignment
+        auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, KRepeat, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
+                                                                         a_block_desc_ak0_m_ak1,
+                                                                         a_blockwise_copy,
+                                                                         a_grid_buf,
+                                                                         a_block_bufs,
+                                                                         a_block_slice_copy_step,
+                                                                         b_grid_desc_bpreshuffled,
+                                                                         b_blockwise_copy,
+                                                                         b_grid_buf,
+                                                                         b_block_bufs,
+                                                                         b_block_slice_copy_step,
+                                                                         c_thread_buf,
+                                                                         num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared_0),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 c_element_op};
+
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const ADataType* p_a_grid,
+                                    const BDataType* p_b_grid,
+                                    CDataType* p_c_grid,
+                                    void* p_shared_0,
+                                    void* p_shared_1,
+                                    const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bpreshuffled =
+            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        Run_2Lds<decltype(a_grid_desc_ak0_m_ak1),
+                 decltype(b_grid_desc_bpreshuffled),
+                 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                 HasMainKBlockLoop,
+                 CGlobalMemoryDataOperation,
+                 TailNum>(p_a_grid,
+                          p_b_grid,
+                          p_c_grid,
+                          p_shared_0,
+                          p_shared_1,
+                          problem,
+                          a_grid_desc_ak0_m_ak1,
+                          b_grid_desc_bpreshuffled,
+                          c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
index 2e62110416..7124687d5d 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -13,6 +13,7 @@
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 
 namespace ck {
 
@@ -157,8 +158,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
     static constexpr bool is_single_rate_mfma =
-        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
-         lcm_AK1_BK1 <= 4)
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
             ? true
             : false;
     static constexpr index_t KPack =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
index f9071bd29d..ac3e821340 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -193,9 +194,17 @@ struct GridwiseGemm_xdl_cshuffle_v3
     using BsGridPointer = decltype(MakeBsGridPointer());
     using DsGridPointer = decltype(MakeDsGridPointer());
 
-    static constexpr index_t KPack = math::max(
-        math::lcm(AK1Number, BK1Number),
-        MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk);
+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
+            ? true
+            : false;
+    static constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
index a9e73bf461..4163d1d01a 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
@@ -179,9 +179,18 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
 
     using DsGridPointer = decltype(MakeDsGridPointer());
 
-    static constexpr index_t KPack = math::max(
-        math::lcm(AK1Number, BK1Number),
-        MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk);
+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
+            ? true
+            : false;
+
+    static constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
index 813acfa656..21812380c2 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -149,9 +149,18 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
     using DsGridPointer = decltype(MakeDsGridPointer());
 
-    static constexpr index_t KPack = math::max(
-        math::lcm(AK1Number, BK1Number),
-        MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>::selected_mfma.k_per_blk);
+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+          lcm_AK1_BK1 <= 4) ||
+         (is_same<ComputeTypeA, int8_t>::value && lcm_AK1_BK1 <= 8))
+            ? true
+            : false;
+
+    static constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
@@ -225,7 +234,7 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
     }
 
-    __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+    __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
         index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
     {
         const auto a_grid_desc_mraw_kraw = [&]() {
@@ -307,7 +316,7 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         }
     }
 
-    __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+    __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
         index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
     {
         const auto b_grid_desc_nraw_kraw = [&]() {
@@ -422,6 +431,13 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             }
         }();
 
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+#if 0
         using GemmSpecialization = tensor_operation::device::GemmSpecialization;
 
         if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
@@ -459,6 +475,7 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             // not pad M or N
             return c_grid_desc_mraw_nraw;
         }
+#endif
     }
 
     __host__ __device__ static auto MakeDsGridDescriptor_M_N(
@@ -656,40 +673,19 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         // in some cases.
         else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
-            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(LDSTypeA) < 1
-                                                  ? 1
-                                                  : 32 * 4 / KPerBlock / sizeof(LDSTypeA);
-            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
-                make_tuple(
-                    AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
-                make_tuple(AK1Number, Number<KPerBlock * MLdsLayer>{}, I1));
+            constexpr auto a_lds_block_desc =
+                make_naive_tensor_descriptor(make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                                             make_tuple(AK1Number, Number<KPerBlock>{}, I1));
 
             constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
                 a_lds_block_desc,
-                make_tuple(make_xor_with_modulo_transform(make_tuple(
-                               Number<MPerBlock / MLdsLayer>{}, Number<AK0Number * MLdsLayer>{})),
+                make_tuple(make_xor_with_modulo_transform(
+                               make_tuple(Number<MPerBlock>{}, Number<AK0Number>{})),
                            make_pass_through_transform(AK1Number)),
                 make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
                 make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
 
-            constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor(
-                a_lds_block_desc_permuted,
-                make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number<MLdsLayer>{})),
-                           make_pass_through_transform(Number<MPerBlock / MLdsLayer>{}),
-                           make_pass_through_transform(AK1Number)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
-
-            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
-                a_lds_block_desc_ak0_mldslayer_m_ak1,
-                make_tuple(make_pass_through_transform(AK0Number),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(Number<MPerBlock / MLdsLayer>{}, Number<MLdsLayer>{})),
-                           make_pass_through_transform(AK1Number)),
-                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            return a_lds_block_desc_ak0_m_ak1;
+            return a_lds_block_desc_permuted;
         }
         else // ColumnMajor A
         {
@@ -791,42 +787,19 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         }
         else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
         {
-            // NLdsLayer * K0 as logical Bank
-            constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(LDSTypeB) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(LDSTypeB);
-            ;
-            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
-                make_tuple(
-                    BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
-                make_tuple(BK1Number, Number<KPerBlock * NLdsLayer>{}, I1));
+            constexpr auto b_lds_block_desc =
+                make_naive_tensor_descriptor(make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
+                                             make_tuple(BK1Number, Number<KPerBlock>{}, I1));
 
             constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
                 b_lds_block_desc,
-                make_tuple(make_xor_with_modulo_transform(make_tuple(
-                               Number<NPerBlock / NLdsLayer>{}, Number<BK0Number * NLdsLayer>{})),
+                make_tuple(make_xor_with_modulo_transform(
+                               make_tuple(Number<NPerBlock>{}, Number<BK0Number>{})),
                            make_pass_through_transform(BK1Number)),
                 make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
                 make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
 
-            constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor(
-                b_lds_block_desc_permuted,
-                make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number<NLdsLayer>{})),
-                           make_pass_through_transform(Number<NPerBlock / NLdsLayer>{}),
-                           make_pass_through_transform(BK1Number)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
-
-            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
-                b_lds_block_desc_bk0_nldslayer_n_bk1,
-                make_tuple(make_pass_through_transform(BK0Number),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(Number<NPerBlock / NLdsLayer>{}, Number<NLdsLayer>{})),
-                           make_pass_through_transform(BK1Number)),
-                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            return b_lds_block_desc_bk0_n_bk1;
+            return b_lds_block_desc_permuted;
         }
         else // RowMajor B
         {
@@ -992,7 +965,8 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
         {
             if(!(karg.M % MPerBlock == 0))
             {
@@ -1009,7 +983,8 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
         {
             if(!(karg.N % NPerBlock == 0))
             {
@@ -1357,28 +1332,39 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
             KPerBlock);
 
-        const index_t ScaleSliceSizeM = 1;
-        const index_t ScaleSliceSizeN = 1;
-        const index_t ScaleSliceSizeK = 1;
+        constexpr index_t ScaleSliceSizeM = MXdlPerWave;
+        constexpr index_t ScaleSliceSizeN = math::integer_divide_ceil(NPerBlock, ScaleBlockN);
+        constexpr index_t ScaleSliceSizeK = math::integer_divide_ceil(KPerBlock, ScaleBlockK);
 
+        // ScaleSliceSizeK is last dimension in A/B scale for vector memory access
+        // ScaleSliceSizeK is first dimension in C scale for packed math
         constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
 
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+        auto a_thread_offset =
+            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) / NWaves * MPerXdl;
+
         constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
-            make_tuple(Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeK>{}));
+            make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
+
+        constexpr auto c_scale_thread_desc = make_naive_tensor_descriptor_packed(make_tuple(
+            Number<ScaleSliceSizeK>{}, Number<ScaleSliceSizeM>{}, Number<ScaleSliceSizeN>{}));
 
         auto a_scale_thread_copy =
             ThreadwiseTensorSliceTransfer_v2<AScaleType,
                                              AScaleType,
                                              decltype(a_scale_grid_desc_am_ak),
                                              decltype(a_scale_thread_desc),
-                                             Sequence<ScaleSliceSizeM, ScaleSliceSizeK>,
+                                             Sequence<1, ScaleSliceSizeK>,
                                              Sequence<0, 1>,
                                              1,
-                                             1,
+                                             ScaleSliceSizeK,
                                              1,
                                              false>(
-                a_scale_grid_desc_am_ak, make_multi_index(block_m_id * MPerBlock / ScaleBlockM, 0));
+                a_scale_grid_desc_am_ak,
+                make_multi_index(block_m_id * MPerBlock / ScaleBlockM + a_thread_offset, 0));
 
         auto b_scale_thread_copy =
             ThreadwiseTensorSliceTransfer_v2<BScaleType,
@@ -1388,17 +1374,21 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
                                              Sequence<ScaleSliceSizeN, ScaleSliceSizeK>,
                                              Sequence<0, 1>,
                                              1,
-                                             1,
+                                             ScaleSliceSizeK,
                                              1,
                                              false>(
                 b_scale_grid_desc_bn_ak, make_multi_index(block_n_id * NPerBlock / ScaleBlockN, 0));
 
-        constexpr auto a_scale_thread_slice_copy_step = make_multi_index(0, 1);
-        constexpr auto b_scale_thread_slice_copy_step = make_multi_index(0, 1);
+        // constexpr auto a_scale_thread_slice_copy_step = make_multi_index(0, 1);
+        constexpr auto a_scale_thread_slice_copy_step =
+            make_tuple(make_multi_index(MWaves * MPerXdl, 0),
+                       make_multi_index(-MPerBlock, 0),
+                       make_multi_index(-MPerBlock, ScaleSliceSizeK));
+        constexpr auto b_scale_thread_slice_copy_step = make_multi_index(0, ScaleSliceSizeK);
 
-        const index_t num_k_block_per_scale = ScaleBlockK / KPerBlock;
+        constexpr auto NumKBlockPerScale = math::integer_divide_ceil(ScaleBlockK, KPerBlock);
 
-        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
             a_grid_desc_ak0_m_ak1,
             a_block_desc_ak0_m_ak1,
             a_blockwise_copy,
@@ -1411,6 +1401,8 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             b_grid_buf,
             b_block_buf,
             b_block_slice_copy_step,
+
+            c_scale_thread_desc,
             c_thread_buf,
 
             a_scale_grid_desc_am_ak,
@@ -1425,8 +1417,7 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             b_scale_grid_buf,
             b_scale_thread_slice_copy_step,
 
-            num_k_block_main_loop,
-            num_k_block_per_scale);
+            num_k_block_main_loop);
 
         // shuffle C and write out
         {
@@ -1437,23 +1428,24 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
             constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
 
-            // TODO: hacky, fix it!
-            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
-                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+            // transposed XDL
+            // // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
 
-            // TODO: hacky, fix it!
-            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
-            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
-                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+            // // TODO: hacky, fix it!
+            // only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
 
-            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
-            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
-            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
-            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
-            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
-            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
-            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
-            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I4);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I5);
+            constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6);
+            constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7);
 
             constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
@@ -1462,24 +1454,24 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
                 static_cast<CShuffleDataType*>(p_shared),
                 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
 
-            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4 = transform_tensor_descriptor(
                 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
                 make_tuple(
                     make_freeze_transform(I0),
                     make_unmerge_transform(make_tuple(
                         Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
                         M1,                                      // M1 = MWave
-                        M2,                                      // M2 * M3 * M4 = MPerXdl
-                        M3,
-                        M4)),
+                        M2)),                                    // M2 = MPerXdl
                     make_freeze_transform(I0),
                     make_unmerge_transform(make_tuple(
                         Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
                         N1,                                      // N1 = NWave
-                        N2))),                                   // N2 = NPerXdl
+                        N2,                                      // N2 * N3 * N4 = NPerXdl
+                        N3,
+                        N4))),
                 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                 make_tuple(
-                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+                    Sequence<>{}, Sequence<0, 2, 4>{}, Sequence<>{}, Sequence<1, 3, 5, 6, 7>{}));
 
             // calculate origin of thread output tensor on global memory
             //     blockwise GEMM c matrix starting index
@@ -1489,57 +1481,57 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
             const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
             const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
 
-            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+            const auto m_thread_data_on_block_to_m0_m1_m2_adaptor =
                 make_single_stage_tensor_adaptor(
-                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
-                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
-                    make_tuple(Sequence<0>{}));
-
-            const auto m_thread_data_on_block_idx =
-                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
-                    make_multi_index(m_thread_data_on_block));
-
-            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
-                make_single_stage_tensor_adaptor(
-                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2))),
                     make_tuple(Sequence<0, 1, 2>{}),
                     make_tuple(Sequence<0>{}));
 
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3, N4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
             const auto n_thread_data_on_block_idx =
-                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor.CalculateBottomIndex(
                     make_multi_index(n_thread_data_on_block));
 
             // shuffle: threadwise copy C from VGPR to LDS
             auto c_thread_copy_vgpr_to_lds =
                 ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                                    CShuffleDataType,
-                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4),
+                                                   tensor_operation::element_wise::PassThrough,
                                                    Sequence<CShuffleMXdlPerWavePerShuffle,
                                                             CShuffleNXdlPerWavePerShuffle,
                                                             I1,
                                                             I1,
-                                                            M2,
                                                             I1,
-                                                            M4,
-                                                            I1>,
+                                                            N2,
+                                                            I1,
+                                                            N4>,
                                                    Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                    7,
                                                    1,
                                                    InMemoryDataOperationEnum::Set,
                                                    1,
                                                    true>{
-                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4,
                     make_multi_index(0,
                                      0,
                                      m_thread_data_on_block_idx[I1],
                                      n_thread_data_on_block_idx[I1],
                                      m_thread_data_on_block_idx[I2],
-                                     m_thread_data_on_block_idx[I3],
-                                     m_thread_data_on_block_idx[I4],
-                                     n_thread_data_on_block_idx[I2]),
-                    ck::tensor_operation::element_wise::PassThrough{}};
+                                     n_thread_data_on_block_idx[I2],
+                                     n_thread_data_on_block_idx[I3],
+                                     n_thread_data_on_block_idx[I4]),
+                    tensor_operation::element_wise::PassThrough{}};
 
             using EDataType = CDataType;
 
@@ -1621,18 +1613,17 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
                  make_tuple(make_multi_index(block_m_id, 0, block_n_id, 0)),
                  c_element_op};
 
-            // space filling curve for threadwise C in VGPR
             constexpr auto sfc_c_vgpr =
-                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, 1, N2, 1, N4>,
                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                   Sequence<CShuffleMXdlPerWavePerShuffle,
                                            CShuffleNXdlPerWavePerShuffle,
                                            1,
                                            1,
-                                           M2,
                                            1,
-                                           M4,
-                                           1>>{};
+                                           N2,
+                                           1,
+                                           N4>>{};
 
             constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
 
@@ -1652,10 +1643,10 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
                 block_sync_lds();
 
                 // each thread write its data from VGPR to LDS
-                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4,
                                               sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
                                               c_thread_buf,
-                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4,
                                               c_shuffle_block_buf);
 
                 // make sure it's safe to read from LDS
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
new file mode 100644
index 0000000000..44d515e76c
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
@@ -0,0 +1,2293 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
+
+namespace ck {
+
+// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
+// kernel function Blockers:
+// 1. Two separted declaration of __shared__ pointer is the key to make sure data access operate on
+// two lds chunks.
+// 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
+// buffer when we declare __shared__ inside blkgemmpipe
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        p_shared,
+        karg);
+
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+
+    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+        karg.p_b_scale_grid + splitk_batch_offset.scale_k_split_offset,
+        p_shared_0,
+        p_shared_1,
+        karg);
+
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t ScaleBlockSize, // Scaling block size
+          index_t BlockSize,      // Thread block size
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v3,
+          typename ComputeTypeA =
+              ADataType, // XXX: These should always be the same as ADataType and BDataType
+          typename ComputeTypeB =
+              BDataType, // TODO: Hardcode them and remove from the list of template parameters
+          bool PermuteA = false,
+          bool PermuteB = false>
+struct GridwiseGemmMX_xdl_cshuffle_v3
+{
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+
+    // K1 should be Number<...>
+    static constexpr auto AK0Number = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number = Number<AK1Value>{};
+    static constexpr auto BK1Number = Number<BK1Value>{};
+
+    static constexpr auto lcm_AK1_BK1         = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma = false;
+    static constexpr auto is_scale_mfma       = true;
+
+    //> KPack is at least the k_per_blk of selected mfma
+    //
+    // Should be a multiple of k_per_blk.
+    // TODO: Move this to blockwise pipeline base
+    static constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA,
+                               MPerXdl,
+                               NPerXdl,
+                               ComputeTypeB,
+                               is_single_rate_mfma,
+                               is_scale_mfma>::selected_mfma.k_per_blk);
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
+    }
+
+    __host__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_least_multiple(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_least_multiple(N, NPerBlock);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K)
+    {
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / AK1Value);
+    }
+
+    __host__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / BK1Value);
+    }
+
+    __host__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * KPerBlock;
+    }
+
+    __host__ static auto CalculateKRead(index_t K, index_t K_Batch = 1)
+    {
+        constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+        auto K_t                = K_Batch * KReadVec;
+        return (K + K_t - 1) / K_t * KReadVec;
+    }
+
+    __host__ static auto CalculateMBlock(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock);
+    }
+
+    __host__ static auto CalculateNBlock(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock);
+    }
+
+    template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
+    __host__ __device__ static constexpr auto MakeGemmMmaTileDescriptor(const TileDesc_K0_MN_K1&)
+    {
+        constexpr index_t K0 = TileDesc_K0_MN_K1{}.GetLength(Number<0>{});
+        constexpr index_t K1 = TileDesc_K0_MN_K1{}.GetLength(Number<2>{});
+
+        return transform_tensor_descriptor(
+            TileDesc_K0_MN_K1{},
+            make_tuple(make_merge_transform_v3_division_mod(make_tuple(Number<K0>{}, Number<K1>{})),
+                       make_unmerge_transform(make_tuple(
+                           Number<MNXdlPerWave>{}, Number<MNWaves>{}, Number<MNPerXdl>{}))),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+            make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            if constexpr(!PermuteB)
+            {
+                // not pad N or K
+                const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                    b_grid_desc_nraw_kraw,
+                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                               make_pass_through_transform(N)),
+                    make_tuple(Sequence<1>{}, Sequence<0>{}),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+                return b_grid_desc_bk0_n_bk1;
+            }
+            else
+            {
+                // Weight Tile Permute
+                constexpr index_t BK01 = KPerBlock / BK1Value;
+                // const index_t BK00     = BK0 / BK01;
+                const index_t BK0_ = StrideB / BK1Value;
+                const index_t BK00 = BK0_ / BK01;
+
+                const auto b_grid_desc_bk00_n_bk01_bk1_permute =
+                    make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value));
+
+                const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor(
+                    b_grid_desc_bk00_n_bk01_bk1_permute,
+                    make_tuple(make_merge_transform(make_tuple(BK00, BK01)),
+                               make_pass_through_transform(make_tuple(N)),
+                               make_pass_through_transform(BK1Value)),
+                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+                return b_grid_desc_bk0_n_bk1_permute;
+            }
+        }
+    }
+
+    template <typename ABlockDesc_AK0_M_AK1>
+    __host__ __device__ static constexpr auto
+    MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&)
+    {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+
+        return MakeGemmMmaTileDescriptor<MXdlPerWave, MWaves, MPerXdl>(ABlockDesc_AK0_M_AK1{});
+    }
+
+    template <typename BBlockDesc_BK0_N_BK1>
+    __host__ __device__ static constexpr auto
+    MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
+    {
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        return MakeGemmMmaTileDescriptor<NXdlPerWave, NWaves, NPerXdl>(BBlockDesc_BK0_N_BK1{});
+    }
+
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+#if 0
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+#endif
+    }
+
+    struct Problem
+    {
+        __host__ Problem(index_t M_,
+                         index_t N_,
+                         index_t K_,
+                         index_t StrideA_,
+                         index_t StrideScaleA_,
+                         index_t StrideB_,
+                         index_t StrideScaleB_,
+                         index_t StrideC_,
+                         index_t KBatch_)
+            : M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideScaleA{StrideScaleA_},
+              StrideB{StrideB_},
+              StrideScaleB{StrideScaleB_},
+              StrideC{StrideC_},
+              KBatch{KBatch_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KRead{CalculateKRead(K_, KBatch_)},
+              KPadded{CalculateKPadded(K_, KBatch_)},
+              AK0{CalculateAK0Padded(K_, KBatch_)},
+              BK0{CalculateBK0Padded(K_, KBatch_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SScaleA:" << StrideScaleA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SScaleB:" << StrideScaleB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KRead:" << KRead << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideScaleA;
+        index_t StrideB;
+        index_t StrideScaleB;
+        index_t StrideC;
+        index_t KBatch;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KRead;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+    };
+
+    // Argument
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const ADataType* p_a_grid_,
+                          const AScaleDataType* p_a_scale_grid_,
+                          const BDataType* p_b_grid_,
+                          const BScaleDataType* p_b_scale_grid_,
+                          CDataType* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideScaleA_,
+                          index_t StrideB_,
+                          index_t StrideScaleB_,
+                          index_t StrideC_,
+                          index_t k_batch_,
+                          AElementwiseOperation a_element_op_,
+                          BElementwiseOperation b_element_op_,
+                          CElementwiseOperation c_element_op_,
+                          bool is_reduce_ = false)
+            : Problem{M_,
+                      N_,
+                      K_,
+                      StrideA_,
+                      StrideScaleA_,
+                      StrideB_,
+                      StrideScaleB_,
+                      StrideC_,
+                      k_batch_},
+              p_a_grid{p_a_grid_},
+              p_a_scale_grid{p_a_scale_grid_},
+              p_b_grid{p_b_grid_},
+              p_b_scale_grid{p_b_scale_grid_},
+              p_c_grid{p_c_grid_},
+              a_element_op{a_element_op_},
+              b_element_op{b_element_op_},
+              c_element_op{c_element_op_},
+              is_reduce(is_reduce_)
+        {
+        }
+
+        __host__ __device__ inline bool IsReduceAdd() const
+        {
+            return (Problem::KBatch > 1) && is_reduce;
+        }
+
+        __host__ __device__ inline bool IsAtomicAdd() const
+        {
+            return (Problem::KBatch > 1) && (!is_reduce);
+        }
+
+        const ADataType* p_a_grid;
+        const AScaleDataType* p_a_scale_grid;
+        const BDataType* p_b_grid;
+        const BScaleDataType* p_b_scale_grid;
+        CDataType* p_c_grid;
+
+        const AElementwiseOperation a_element_op;
+        const BElementwiseOperation b_element_op;
+        const CElementwiseOperation c_element_op;
+        bool is_reduce;
+    };
+
+    struct SplitKBatchOffset
+    {
+
+        __device__ SplitKBatchOffset(Argument& karg, index_t k_id)
+        {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * karg.KRead / APackedSize;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * karg.KRead * karg.StrideA;
+            }
+
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * karg.KRead * karg.StrideB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                if constexpr(!PermuteB)
+                {
+                    b_k_split_offset = k_id * karg.KRead / BPackedSize;
+                }
+                else
+                {
+                    const int k0_offset = karg.KRead * karg.N;
+                    b_k_split_offset    = k_id * k0_offset / BPackedSize;
+                }
+            }
+
+            // Calculate A scale offset
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_scale_k_split_offset = k_id * karg.KRead / ScaleBlockSize;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_scale_k_split_offset = k_id * karg.KRead / ScaleBlockSize * karg.StrideScaleA;
+            }
+
+            // Calculate B scale offset
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_scale_k_split_offset = k_id * (karg.KRead / ScaleBlockSize) * karg.StrideScaleB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                b_scale_k_split_offset = k_id * karg.KRead / ScaleBlockSize;
+            }
+
+            if(k_id < (karg.KBatch - 1))
+            {
+                karg.K = karg.KRead;
+            }
+            else
+            {
+                karg.K = karg.K - karg.KRead * (karg.KBatch - 1);
+            }
+
+            if(karg.IsReduceAdd())
+            {
+                c_reduce_offset = k_id * karg.M * karg.N;
+            }
+            else
+            {
+                c_reduce_offset = 0;
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t a_scale_k_split_offset; // New member for scale matrix offset
+        index_t b_scale_k_split_offset; // New member for scale matrix offset
+        index_t c_reduce_offset;
+    };
+
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock + ABlockLdsExtraM>{}, I1));
+        }
+        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
+        // in some cases.
+        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize;
+            constexpr auto MLdsLayer        = LdsSize < 1 ? 1 : LdsSize;
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
+                make_tuple(
+                    AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock * MLdsLayer>{}, I1));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(make_tuple(
+                               Number<MPerBlock / MLdsLayer>{}, Number<AK0Number * MLdsLayer>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number<MLdsLayer>{})),
+                           make_pass_through_transform(Number<MPerBlock / MLdsLayer>{}),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_ak0_mldslayer_m_ak1,
+                make_tuple(make_pass_through_transform(AK0Number),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<MPerBlock / MLdsLayer>{}, Number<MLdsLayer>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+        else // ColumnMajor A
+        {
+            // kfold and mpair dimension is not always required.
+            // more dimension in merge_transform increase the difficulty of generating immarg offset
+            // for compiler.
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
+
+            constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = BlockwiseGemmPipe::WaveSize / MPerXdl;
+            constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
+
+            constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : 128 / (AK1Number * M0 * sizeof(ADataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=mpair<=n0
+            constexpr auto mpair = (AK1Number * MPerXdl * sizeof(ADataType) > 128)
+                                       ? 1
+                                       : ((128 / (AK1Number * MPerXdl * sizeof(ADataType))) > M0
+                                              ? M0
+                                              : 128 / (AK1Number * MPerXdl * sizeof(ADataType)));
+
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * M1>{},
+                           Number<kfold * M0 / mpair>{},
+                           Number<mpair>{},
+                           AK1Number));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * M1>{}, Number<kfold * M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<M1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<M0 / mpair>{}, Number<mpair>{}, Number<M1>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // B matrix in LDS memory, dst of blockwise copy
+        if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
+                make_tuple(BK1Number, Number<KPerBlock + BBlockLdsExtraN>{}, I1));
+        }
+        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+        {
+            // NLdsLayer * K0 as logical Bank
+            constexpr index_t LdsSize       = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize;
+            constexpr index_t NLdsLayer     = LdsSize < 1 ? 1 : LdsSize;
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
+                make_tuple(
+                    BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
+                make_tuple(BK1Number, Number<KPerBlock * NLdsLayer>{}, I1));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(make_tuple(
+                               Number<NPerBlock / NLdsLayer>{}, Number<BK0Number * NLdsLayer>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number<NLdsLayer>{})),
+                           make_pass_through_transform(Number<NPerBlock / NLdsLayer>{}),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}));
+
+            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_bk0_nldslayer_n_bk1,
+                make_tuple(make_pass_through_transform(BK0Number),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<NPerBlock / NLdsLayer>{}, Number<NLdsLayer>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return b_lds_block_desc_bk0_n_bk1;
+        }
+        else // RowMajor B
+        {
+            constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N1 = NPerBlock / N0;
+
+            constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = BlockwiseGemmPipe::WaveSize / NPerXdl;
+            constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
+
+            constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : 128 / (BK1Number * N0 * sizeof(BDataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=npair<=n0
+            constexpr auto npair = (BK1Number * NPerXdl * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : ((128 / (BK1Number * NPerXdl * sizeof(BDataType))) > N0
+                                              ? N0
+                                              : 128 / (BK1Number * NPerXdl * sizeof(BDataType)));
+
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * N1>{},
+                           Number<kfold * N0 / npair>{},
+                           Number<npair>{},
+                           BK1Number));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * N1>{}, Number<kfold * N0 / npair>{})),
+                    make_pass_through_transform(Number<npair>{}),
+                    make_pass_through_transform(BK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<N1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<N0 / npair>{})),
+                    make_pass_through_transform(Number<npair>{}),
+                    make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<N0 / npair>{}, Number<npair>{}, Number<N1>{})),
+                           make_pass_through_transform(BK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return b_lds_block_desc_bk0_n_bk1;
+        }
+    }
+
+    __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                           I1,
+                           Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+
+        return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    using BlockwiseGemmPipe =
+        remove_cvref_t<decltype(BlockGemmMXPipeline_Selector<
+                                BlkGemmPipelineVer,
+                                BlkGemmPipeSched,
+                                BlockSize,
+                                ScaleBlockSize,
+                                ADataType,
+                                AScaleDataType,
+                                BDataType,
+                                BScaleDataType,
+                                ComputeTypeA,
+                                AccDataType,
+                                decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()),
+                                decltype(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()),
+                                decltype(MakeAMmaTileDescriptor_M0_M1_M2_K(
+                                    GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())),
+                                decltype(MakeBMmaTileDescriptor_N0_N1_N2_K(
+                                    GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())),
+                                ABlockTransferSrcScalarPerVector,
+                                BBlockTransferSrcScalarPerVector,
+                                MPerBlock,
+                                NPerBlock,
+                                KPerBlock,
+                                MPerXdl,
+                                NPerXdl,
+                                MXdlPerWave,
+                                NXdlPerWave,
+                                KPack>())>;
+
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
+
+        return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize +
+                          b_block_space_size_aligned * sizeof(BDataType) / BPackedSize),
+                         c_block_size * sizeof(CShuffleDataType));
+    }
+
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    __host__ static constexpr bool CheckValidity(const Argument& karg)
+    {
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
+
+        static_assert(KPerBlock % ScaleBlockSize == 0,
+                      "KPerBlock should be multiple of ScaleBlockSize");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
+        {
+            if(!(karg.M % MPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
+        {
+            if(!(karg.N % NPerBlock == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            auto K_t = karg.KBatch * KPerBlock;
+            if(!(karg.K % K_t == 0))
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
+                              << karg.K << " " << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+            auto K_t                = karg.KBatch * KReadVec;
+            auto KReadPadSplited    = math::integer_divide_ceil(karg.K, K_t) * KReadVec;
+            if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg K (" << karg.K
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg N (" << karg.N
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "Arg M (" << karg.M
+                              << ") value is not a multiple of "
+                                 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                              << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! "
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
+        if constexpr(!(is_same<remove_cvref_t<CDataType>, half_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, float>::value ||
+                       is_same<remove_cvref_t<CDataType>, bhalf_t>::value ||
+                       is_same<remove_cvref_t<CDataType>, int32_t>::value))
+        {
+            if(!karg.IsReduceAdd())
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__
+                              << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+                }
+                if(karg.KBatch > 1)
+                {
+                    return false;
+                }
+            }
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value);
+
+        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        {
+            if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
+            {
+                return false;
+            }
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
+    }
+
+    __host__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
+    }
+
+    template <typename CGridDesc>
+    __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    // if arch = gfx942
+    using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
+    // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename AScaleGridDesc_AM_AK,
+              typename BGridDesc_BK0_N_K1,
+              typename BScaleGridDesc_BN_AK,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const AScaleDataType* p_a_scale_grid,
+                               const BDataType* p_b_grid,
+                               const BScaleDataType* p_b_scale_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem,
+                               const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                               const AScaleGridDesc_AM_AK& a_scale_grid_desc_am_ak,
+                               const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
+                               const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak,
+                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        // A Scale buffer
+        const auto a_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_scale_grid, a_scale_grid_desc_am_ak.GetElementSpaceSize());
+
+        // B Scale buffer
+        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                BElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
+                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BDataType,
+                                                BDataType,
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
+                                                BBlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                BBlockTransferSrcVectorDim,
+                                                2,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_BK1,
+                                                1,
+                                                1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                b_grid_desc_bk0_n_bk1,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_element_op,
+                b_block_desc_bk0_n_bk1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        // Cast after lds
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) + a_block_space_size_aligned *
+                                                                            sizeof(ADataType) /
+                                                                            APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        // Initial thread mapping for:
+        // BlockSize = 256
+        // MPerXdl=NPerXdl=32 and MPerBlock=NPerBlock=128 MRepeat=NRepeat=2 MWaves=NWaves=2
+        // For each [m0, n0] tile, there are 4 waves:
+        // tId in [  0,  63]  m x n = [ 0, 31] x [ 0, 31]  waveId = [0, 0]
+        // tId in [ 64, 127]  m x n = [ 0, 31] x [32, 63]  waveId = [0, 1]
+        // tId in [128, 191]  m x n = [32, 63] x [ 0, 31]  waveId = [1, 0]
+        // tId in [192, 255]  m x n = [32, 63] x [32, 63]  waveId = [1, 1]
+
+        // BlockSize = 128
+        // MPerXdl=NPerXdl=16 and MPerBlock=128 NPerBlock=16 MRepeat=4 NRepeat=1 MWaves=2 NWaves=1
+        // For each [m0, n0] tile, there are 2 waves:
+        // tId in [  0,  63]  m x n = [ 0, 15] x [0, 15]  waveId = [0, 0]
+        // tId in [ 64, 127]  m x n = [16, 31] x [0, 15]  waveId = [1, 0]
+
+        // TODO: Document initial thread mapping for more combinations of parameters
+
+        const auto wave_idx = BlockwiseGemmPipe::GetWaveIdx();
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        static constexpr auto mfma = BlockwiseGemmPipe::xdlops_gemm.mfma;
+
+        auto thread_offset_k = (get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize) /
+                               mfma.selected_mfma.num_threads_per_blk;
+
+        auto a_thread_offset_m = get_thread_local_1d_id() % MPerXdl + waveId_m * MPerXdl;
+
+        auto a_scale_thread_copy =
+            ThreadwiseTensorSliceTransfer_v2<AScaleDataType,
+                                             AScaleDataType,
+                                             decltype(a_scale_grid_desc_am_ak),
+                                             decltype(BlockwiseGemmPipe::a_scale_thread_desc_copy),
+                                             Sequence<1, 1>, // SliceLengths
+                                             Sequence<0, 1>, // DimAccessOrder
+                                             1,              // SrcVectorDim
+                                             1,              // SrcScalarPerVector
+                                             1,              // SrcScalarStrideInVector
+                                             true>(
+                a_scale_grid_desc_am_ak,
+                make_multi_index(block_m_id * MPerBlock + a_thread_offset_m, thread_offset_k));
+
+        auto b_thread_offset_n = get_thread_local_1d_id() % NPerXdl + waveId_n * NPerXdl;
+
+        auto b_scale_thread_copy =
+            ThreadwiseTensorSliceTransfer_v2<BScaleDataType,
+                                             BScaleDataType,
+                                             decltype(b_scale_grid_desc_bn_ak),
+                                             decltype(BlockwiseGemmPipe::b_scale_thread_desc_copy),
+                                             Sequence<1, 1>, // SliceLengths
+                                             Sequence<0, 1>, // DimAccessOrder
+                                             1,              // SrcVectorDim
+                                             1,              // SrcScalarPerVector
+                                             1,
+                                             true>(
+                b_scale_grid_desc_bn_ak,
+                make_multi_index(block_n_id * NPerBlock + b_thread_offset_n, thread_offset_k));
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
+                                                                         a_block_desc_ak0_m_ak1,
+                                                                         a_blockwise_copy,
+                                                                         a_grid_buf,
+                                                                         a_block_buf,
+                                                                         a_block_slice_copy_step,
+                                                                         b_grid_desc_bk0_n_bk1,
+                                                                         b_block_desc_bk0_n_bk1,
+                                                                         b_blockwise_copy,
+                                                                         b_grid_buf,
+                                                                         b_block_buf,
+                                                                         b_block_slice_copy_step,
+                                                                         c_thread_buf,
+                                                                         a_scale_grid_desc_am_ak,
+                                                                         a_scale_thread_copy,
+                                                                         a_scale_grid_buf,
+                                                                         b_scale_grid_desc_bn_ak,
+                                                                         b_scale_thread_copy,
+                                                                         b_scale_grid_buf,
+                                                                         num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 c_element_op};
+
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const ADataType* p_a_grid,
+                               const AScaleDataType* p_a_scale_grid,
+                               const BDataType* p_b_grid,
+                               const BScaleDataType* p_b_scale_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        // A Scale grid
+        const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor(
+            make_tuple(problem.M, math::integer_divide_ceil(problem.K, ScaleBlockSize)),
+            make_tuple(problem.StrideScaleA, 1));
+
+        // B Scale grid transposed
+        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
+            make_tuple(problem.N, math::integer_divide_ceil(problem.K, ScaleBlockSize)),
+            make_tuple(problem.StrideScaleB, 1));
+
+        Run<decltype(a_grid_desc_ak0_m_ak1),
+            decltype(a_scale_grid_desc_am_ak),
+            decltype(b_grid_desc_bk0_n_bk1),
+            decltype(b_scale_grid_desc_bn_ak),
+            decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+            HasMainKBlockLoop,
+            CGlobalMemoryDataOperation,
+            TailNum>(p_a_grid,
+                     p_a_scale_grid,
+                     p_b_grid,
+                     p_b_scale_grid,
+                     p_c_grid,
+                     p_shared,
+                     problem,
+                     a_grid_desc_ak0_m_ak1,
+                     a_scale_grid_desc_am_ak,
+                     b_grid_desc_bk0_n_bk1,
+                     b_scale_grid_desc_bn_ak,
+                     c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+
+    template <typename AGridDesc_AK0_M_K1,
+              typename AScaleGridDesc_AM_AK,
+              typename BGridDesc_BK0_N_K1,
+              typename BScaleGridDesc_BN_AK,
+              typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const ADataType* p_a_grid,
+                                    const AScaleDataType* p_a_scale_grid,
+                                    const BDataType* p_b_grid,
+                                    const BScaleDataType* p_b_scale_grid,
+                                    CDataType* p_c_grid,
+                                    void* p_shared_0,
+                                    void* p_shared_1,
+                                    const Problem& problem,
+                                    const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
+                                    const AScaleGridDesc_AM_AK& a_scale_grid_desc_am_ak,
+                                    const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
+                                    const BScaleGridDesc_BN_AK& b_scale_grid_desc_bn_ak,
+                                    const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock)
+    {
+        ignore = p_a_scale_grid;
+        ignore = a_scale_grid_desc_am_ak;
+
+        // TODO: Implement 2 LDS version
+        static_assert(false, "Not implemented");
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        // B Scale buffer
+        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+
+        const AElementwiseOperation a_element_op{};
+        const BElementwiseOperation b_element_op{};
+        const CElementwiseOperation c_element_op{};
+
+        // divide block work by [M, N]
+        const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4};
+
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0Number, MPerBlock, AK1Number>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                ADataType,
+                                                ADataType,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                BElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<BK0Number, NPerBlock, BK1Number>,
+                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                BDataType,
+                                                BDataType,
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
+                                                BBlockTransferSrcAccessOrder,
+                                                Sequence<0, 1, 2>,
+                                                BBlockTransferSrcVectorDim,
+                                                2,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_BK1,
+                                                1,
+                                                1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                BlockwiseGemmPipe::GlobalBufferNum>(
+                b_grid_desc_bk0_n_bk1,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_element_op,
+                b_block_desc_bk0_n_bk1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            bit_cast<BDataType*>(static_cast<char*>(p_shared_0) +
+                                 a_block_space_size_aligned * sizeof(ADataType) / APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            bit_cast<BDataType*>(bit_cast<char*>(p_shared_1) +
+                                 a_block_space_size_aligned * sizeof(ADataType) / APackedSize),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
+        auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        // B scale
+        static constexpr auto mfma =
+            MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>{};
+        static constexpr auto KPerXdlops  = mfma.GetKPerXdlops();
+        static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
+        static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;
+        static constexpr auto KPerThread  = KPerBlock / K0PerXdlops;
+
+        const index_t ScaleSliceSizeN         = NXdlPerWave;
+        static constexpr auto ScaleSliceSizeK = (KPerThread + ScaleBlockSize - 1) / ScaleBlockSize;
+        static constexpr auto KBlockScaleSliceSizeK =
+            (KPerBlock + ScaleBlockSize - 1) / ScaleBlockSize;
+
+        constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
+            make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
+
+        constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        auto b_thread_offset_n =
+            get_thread_local_1d_id() % NPerXdl +
+            (get_thread_local_1d_id() / BlockwiseGemmPipe::WaveSize) % NWaves * NPerXdl;
+        auto b_thread_offset_k =
+            (get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize) / NPerXdl * KPerThread;
+
+        auto b_scale_thread_copy =
+            ThreadwiseTensorSliceTransfer_v2<BScaleDataType,
+                                             BScaleDataType,
+                                             decltype(b_scale_grid_desc_bn_ak),
+                                             decltype(b_scale_thread_desc),
+                                             Sequence<1, ScaleSliceSizeK>,
+                                             Sequence<0, 1>,
+                                             1,
+                                             ScaleSliceSizeK,
+                                             1,
+                                             false>(
+                b_scale_grid_desc_bn_ak,
+                make_multi_index(block_n_id * NPerBlock + b_thread_offset_n,
+                                 b_thread_offset_k / ScaleBlockSize));
+
+        constexpr auto b_scale_thread_slice_copy_step =
+            make_tuple(make_multi_index(NWaves * NPerXdl, 0),
+                       make_multi_index(-NPerBlock, 0),
+                       make_multi_index(-NPerBlock, KBlockScaleSliceSizeK));
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+            a_grid_desc_ak0_m_ak1,
+            a_block_desc_ak0_m_ak1,
+            a_blockwise_copy,
+            a_grid_buf,
+            a_block_bufs,
+            a_block_slice_copy_step,
+            b_grid_desc_bk0_n_bk1,
+            b_block_desc_bk0_n_bk1,
+            b_blockwise_copy,
+            b_grid_buf,
+            b_block_bufs,
+            b_block_slice_copy_step,
+            c_thread_buf,
+            b_scale_grid_desc_bn_ak,
+            b_scale_thread_desc,
+            b_scale_thread_copy,
+            b_scale_grid_buf,
+            b_scale_thread_slice_copy_step,
+            num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared_0),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // shuffle: blockwise copy C from LDS to global
+            auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
+                ThisThreadBlock,            // ThreadGroup
+                CElementwiseOperation,      // ElementwiseOperation,
+                CGlobalMemoryDataOperation, // DstInMemOp,
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                CShuffleDataType,     // typename SrcData,
+                CDataType,            // typename DstData,
+                decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<0, 1, 2, 3>,                           // typename DimAccessOrder,
+                3,                                              // index_t VectorDim,
+                CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector,
+                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(0, 0, 0, 0),
+                 c_grid_desc_mblock_mperblock_nblock_nperblock,
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
+                 c_element_op};
+
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                c_shuffle_block_copy_lds_to_global.Run(
+                    c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                    c_shuffle_block_buf,
+                    c_grid_desc_mblock_mperblock_nblock_nperblock,
+                    c_grid_buf);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+
+                    // move on C
+                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const ADataType* p_a_grid,
+                                    const BDataType* p_b_grid,
+                                    const BScaleDataType* p_b_scale_grid,
+                                    CDataType* p_c_grid,
+                                    void* p_shared_0,
+                                    void* p_shared_1,
+                                    const Problem& problem)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
+            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(
+            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
+            make_tuple(problem.N, math::integer_divide_ceil(problem.K, ScaleBlockSize)),
+            make_tuple(problem.StrideScaleB, 1));
+
+        Run_2Lds<decltype(a_grid_desc_ak0_m_ak1),
+                 decltype(b_grid_desc_bk0_n_bk1),
+                 decltype(b_scale_grid_desc_bn_ak),
+                 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                 HasMainKBlockLoop,
+                 CGlobalMemoryDataOperation,
+                 TailNum>(p_a_grid,
+                          p_b_grid,
+                          p_b_scale_grid,
+                          p_c_grid,
+                          p_shared_0,
+                          p_shared_1,
+                          problem,
+                          a_grid_desc_ak0_m_ak1,
+                          b_grid_desc_bk0_n_bk1,
+                          b_scale_grid_desc_bn_ak,
+                          c_grid_desc_mblock_mperblock_nblock_nperblock);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
index 0a62464cc2..b435fd5d5a 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -491,8 +491,9 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
index 6a4b1cc14b..ad65e75ef9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -489,8 +489,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
             // branch early for math wave
             constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
             constexpr bool is_single_rate_mfma =
-                ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
-                 lcm_AK1_BK1 <= 4)
+                (((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+                  lcm_AK1_BK1 <= 4) ||
+                 (is_same<ABDataType, int8_t>::value && lcm_AK1_BK1 <= 8))
                     ? true
                     : false;
             constexpr index_t KPack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
index b41e747a3a..168c553180 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -741,11 +741,17 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
         //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         //       register
         // sanity check
+        constexpr bool is_single_rate_mfma =
+            (((is_same<FloatAAdjusted, half_t>::value || is_same<FloatAAdjusted, bhalf_t>::value) &&
+              K1 <= 4) ||
+             (is_same<FloatAAdjusted, int8_t>::value && K1 <= 8))
+                ? true
+                : false;
 
-        constexpr index_t KPack =
-            math::max(K1,
-                      MfmaSelector<FloatAAdjusted, MPerXDL, NPerXDL, FloatBAdjusted>::selected_mfma
-                          .k_per_blk);
+        constexpr index_t KPack = math::max(
+            K1,
+            MfmaSelector<FloatAAdjusted, MPerXDL, NPerXDL, FloatBAdjusted, is_single_rate_mfma>::
+                selected_mfma.k_per_blk);
 
         auto blockwise_gemm =
             BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 6ee279a3f1..256b495c6e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
index 7db8798695..f433b3c974 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -448,8 +448,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         // sanity check
         constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
         constexpr bool is_single_rate_mfma =
-            ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
-             lcm_AK1_BK1 <= 4)
+            (((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+              lcm_AK1_BK1 <= 4) ||
+             (is_same<FloatAB, int8_t>::value && lcm_AK1_BK1 <= 8))
                 ? true
                 : false;
         constexpr index_t k_pack = math::max(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
new file mode 100644
index 0000000000..255fb8cff4
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -0,0 +1,2350 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp"
+
+#define DEBUG_LOG 0
+
+namespace ck {
+
+// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
+// kernel function Blockers:
+// 1. Two separted declaration of __shared__ pointer is the key to make sure data access operate on
+// two lds chunks.
+// 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
+// buffer when we declare __shared__ inside blkgemmpipe
+
+enum Activation
+{
+    gelu_and_mul = 0,
+    silu_and_mul = 1
+};
+
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Even>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_moe_gemm(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_sorted_token_ids,
+        karg.p_sorted_expert_ids,
+        karg.p_max_token_id,
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_ds_grid,
+        karg.p_c_grid,
+        p_shared,
+        karg,
+        karg.a_element_op,
+        karg.b_element_op,
+        karg.c_element_op);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename GridwiseGemm,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Even>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+    // __attribute__((amdgpu_waves_per_eu(1, 1)))
+    kernel_moe_gemm_2lds(typename GridwiseGemm::Argument karg)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+
+    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_sorted_token_ids,
+        karg.p_sorted_expert_ids,
+        karg.p_max_token_id,
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_ds_grid,
+        karg.p_c_grid,
+        p_shared,
+        p_shared1,
+        karg,
+        karg.a_element_op,
+        karg.b_element_op,
+        karg.c_element_op);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__))
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          index_t ActivationOperation                 = 0,
+          bool NSwizzle                               = false,
+          bool IsInputGemm                            = true,
+          bool MulRoutedWeight                        = true,
+          bool PerTokenQuant                          = false,
+          typename IndexType                          = index_t,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          typename LDSTypeA                           = ADataType,
+          typename LDSTypeB                           = BDataType>
+struct GridwiseMoeGemm
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+
+    static constexpr auto CShuffleBlockTransferScalarPerVector_NPerBlock =
+        CDEShuffleBlockTransferScalarPerVectors{}[I0];
+    // K1 should be Number<...>
+    static constexpr auto AK0Number       = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0Number       = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1Number       = Number<AK1Value>{};
+    static constexpr auto BK1Number       = Number<BK1Value>{};
+    static constexpr auto BlockSizeNumber = Number<BlockSize>{};
+
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    using mfma_selector = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>;
+    static constexpr index_t KPack =
+        math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk);
+    static constexpr index_t KLane =
+        mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops();
+    static constexpr index_t KRepeat = KPerBlock / KLane / KPack;
+    static constexpr index_t NLane   = NPerXdl;
+    static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
+    // static constexpr index_t NumTokens = 1;
+    static constexpr index_t SortedTileSize = MPerBlock;
+
+    static constexpr auto MakeDsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                return static_cast<const DDataType*>(nullptr);
+            },
+            Number<NumDTensor>{});
+    }
+
+    using DsGridPointer = decltype(MakeDsGridPointer());
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    __host__ static auto CalculateGridSize(index_t M, index_t N)
+    {
+        const index_t nblock = math::integer_divide_ceil(N, NPerBlock);
+        const index_t mblock = math::integer_divide_ceil(M, MPerBlock);
+        const index_t gridx  = NSwizzle ? nblock * mblock : nblock;
+        const index_t gridy  = NSwizzle ? 1 : mblock;
+
+        return std::make_tuple(gridx, gridy, 1);
+    }
+
+    __host__ __device__ static auto CalculateMPadded(index_t M)
+    {
+        return math::integer_least_multiple(M, MPerBlock);
+    }
+
+    __host__ __device__ static auto CalculateNPadded(index_t N)
+    {
+        return math::integer_least_multiple(N, NPerBlock);
+    }
+
+    __host__ __device__ static auto CalculateBN0Shuffled(index_t N)
+    {
+        return math::integer_divide_ceil(N, NLane);
+    }
+    __host__ __device__ static auto CalculateBK0Shuffled(index_t K)
+    {
+        return math::integer_divide_ceil(K, KLane * KPack);
+    }
+
+    __host__ __device__ static auto CalculateKPadded(index_t K)
+    {
+        return math::integer_divide_ceil(K, KPerBlock) * KPerBlock;
+    }
+
+    __host__ __device__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / AK1Value);
+    }
+
+    __host__ __device__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * (KPerBlock / BK1Value);
+    }
+
+    __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        auto K_t = K_Batch * KPerBlock;
+        return (K + K_t - 1) / K_t * KPerBlock;
+    }
+
+    __host__ __device__ static auto CalculateKRead(index_t K, index_t K_Batch = 1)
+    {
+        constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+        auto K_t                = K_Batch * KReadVec;
+        return (K + K_t - 1) / K_t * KReadVec;
+    }
+
+    __host__ __device__ static auto CalculateMBlock(index_t M)
+    {
+        return math::integer_divide_ceil(M, MPerBlock);
+    }
+
+    __host__ __device__ static auto CalculateNBlock(index_t N)
+    {
+        return math::integer_divide_ceil(N, NPerBlock);
+    }
+
+    template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
+    __host__ __device__ static constexpr auto MakeGemmMmaTileDescriptor(const TileDesc_K0_MN_K1&)
+    {
+        constexpr index_t K0 = TileDesc_K0_MN_K1{}.GetLength(Number<0>{});
+        constexpr index_t K1 = TileDesc_K0_MN_K1{}.GetLength(Number<2>{});
+
+        return transform_tensor_descriptor(
+            TileDesc_K0_MN_K1{},
+            make_tuple(make_merge_transform_v3_division_mod(make_tuple(Number<K0>{}, Number<K1>{})),
+                       make_unmerge_transform(make_tuple(
+                           Number<MNXdlPerWave>{}, Number<MNWaves>{}, Number<MNPerXdl>{}))),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+            make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
+        IndexType M, IndexType MPad, IndexType K, IndexType KPad, IndexType StrideA, IndexType AK0)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(M, MPad - M),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(MPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
+    {
+        constexpr index_t NkSwizzleNumber = Number<warpSize * KPack>{};
+        return make_naive_tensor_descriptor(
+            make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
+            make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
+        index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1));
+            }
+        }();
+
+        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
+
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(N, NPad - N),
+                                                       make_right_pad_transform(K, KPad - K)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(NPad)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_n_k,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            // not pad N or K
+            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                           make_pass_through_transform(N)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return b_grid_desc_bk0_n_bk1;
+        }
+    }
+
+    template <typename ABlockDesc_AK0_M_AK1>
+    __host__ __device__ static constexpr auto
+    MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&)
+    {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+
+        return MakeGemmMmaTileDescriptor<MXdlPerWave, MWaves, MPerXdl>(ABlockDesc_AK0_M_AK1{});
+    }
+
+    template <typename BBlockDesc_BK0_N_BK1>
+    __host__ __device__ static constexpr auto
+    MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
+    {
+        return MakeGemmMmaTileDescriptor<NXdlPerWave, NWave, NPerXdl>(BBlockDesc_BK0_N_BK1{});
+    }
+
+    template <typename ELayout>
+    __host__ __device__ static auto MakeCGridDescriptor_M_N(
+        IndexType M, IndexType MPad, IndexType N, IndexType NPad, IndexType StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ELayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
+    template <typename DLayout>
+    __host__ __device__ static auto
+    MakeDGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, DLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I0));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, DLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I0, StrideC));
+            }
+        }();
+
+        // pad M and N
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
+    __host__ __device__ static auto MakeDsGridDescriptor_M_N(
+        index_t M, index_t MPad, index_t N, index_t NPad, std::array<index_t, NumDTensor> StrideDs)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                return MakeDGridDescriptor_M_N<DLayout>(M, MPad, N, NPad, StrideDs[i]);
+            },
+            Number<NumDTensor>{});
+    }
+
+    template <typename DsGridDesc>
+    __device__ static constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const DsGridDesc& ds_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                return MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    ds_grid_desc_m_n[i], MBlock, NBlock);
+            },
+            Number<NumDTensor>{});
+    }
+
+    struct Problem
+    {
+        __host__ __device__ Problem(index_t NumTokens_,
+                                    index_t TopK_,
+                                    index_t M_,
+                                    index_t N_,
+                                    index_t K_,
+                                    index_t StrideA_,
+                                    index_t StrideB_,
+                                    std::array<index_t, NumDTensor> StrideDs_,
+                                    index_t StrideC_,
+                                    index_t KBatch_)
+            : NumTokens{NumTokens_},
+              TopK{TopK_},
+              M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideDs{StrideDs_},
+              StrideC{StrideC_},
+              KBatch{KBatch_},
+              MPadded{CalculateMPadded(M_)},
+              NPadded{CalculateNPadded(N_)},
+              KRead{CalculateKRead(K_, KBatch_)},
+              KPadded{CalculateKPadded(K_, KBatch_)},
+              AK0{CalculateAK0Padded(K_, KBatch_)},
+              BK0{CalculateBK0Padded(K_, KBatch_)},
+              MBlock{CalculateMBlock(M_)},
+              NBlock{CalculateNBlock(N_)},
+              BN0Shuffled{CalculateBN0Shuffled(N_)},
+              BK0Shuffled{CalculateBK0Shuffled(K_)}
+        {
+        }
+
+        __host__ void Print() const
+        {
+            std::cout << "problem {"
+                      << "NumTokens:" << NumTokens << ", "
+                      << "TopK:" << TopK << ", "
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KRead:" << KRead << ", "
+                      << "KP:" << KPadded << ", "
+                      << "AK0:" << AK0 << ", "
+                      << "BK0:" << BK0 << ", "
+                      << "MBlock: " << MBlock << ", "
+                      << "NBlock: " << NBlock << "}" << std::endl;
+        }
+
+        index_t NumTokens;
+        index_t TopK;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        std::array<index_t, NumDTensor> StrideDs;
+        index_t StrideC;
+        index_t KBatch;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KRead;
+        index_t KPadded;
+        index_t AK0;
+        index_t BK0;
+        index_t MBlock;
+        index_t NBlock;
+        // FOR PRESHUFFLE ONLY
+        index_t BN0Shuffled;
+        index_t BK0Shuffled;
+    };
+
+    // Argument
+    struct Argument : public tensor_operation::device::BaseArgument, public Problem
+    {
+        __host__ Argument(const index_t* p_sorted_token_ids_,
+                          const index_t* p_sorted_expert_ids_,
+                          const index_t* p_max_token_id_,
+                          const ADataType* p_a_grid_,
+                          const BDataType* p_b_grid_,
+                          std::array<const void*, NumDTensor> p_ds_grid_,
+                          CDataType* p_c_grid_,
+                          index_t NumTokens_,
+                          index_t TopK_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          std::array<index_t, NumDTensor> StrideDs_,
+                          index_t StrideC_,
+                          index_t k_batch_,
+                          AElementwiseOperation a_element_op_,
+                          BElementwiseOperation b_element_op_,
+                          CElementwiseOperation c_element_op_)
+            : Problem{NumTokens_,
+                      TopK_,
+                      M_,
+                      N_,
+                      K_,
+                      StrideA_,
+                      StrideB_,
+                      StrideDs_,
+                      StrideC_,
+                      k_batch_},
+              p_sorted_token_ids{p_sorted_token_ids_},
+              p_sorted_expert_ids{p_sorted_expert_ids_},
+              p_max_token_id{p_max_token_id_},
+              p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_ds_grid{},
+              p_c_grid{p_c_grid_},
+              a_element_op{a_element_op_},
+              b_element_op{b_element_op_},
+              c_element_op{c_element_op_}
+        {
+
+            // populate pointer, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType_ = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                // D pointer
+                p_ds_grid(i) = static_cast<const DDataType_*>(p_ds_grid_[i]);
+            });
+        }
+
+        const index_t* p_sorted_token_ids;
+        const index_t* p_sorted_expert_ids;
+        const index_t* p_max_token_id;
+        const ADataType* p_a_grid;
+        const BDataType* p_b_grid;
+        DsGridPointer p_ds_grid;
+        CDataType* p_c_grid;
+
+        const AElementwiseOperation a_element_op;
+        const BElementwiseOperation b_element_op;
+        const CElementwiseOperation c_element_op;
+    };
+
+    struct SplitKBatchOffset
+    {
+        __device__ SplitKBatchOffset(Argument& karg, index_t k_id)
+        {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * karg.KRead / APackedSize;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * karg.KRead * karg.StrideA;
+            }
+
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * karg.KRead * karg.StrideB;
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                // KPack * NLane * KLane * K0 * N0
+                b_k_split_offset = k_id * karg.KRead * NLane / BPackedSize;
+            }
+
+            if(k_id < karg.KBatch - 1)
+            {
+                karg.K = karg.KRead;
+            }
+            else
+            {
+                karg.K = karg.K - karg.KRead * (karg.KBatch - 1);
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+    };
+
+    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        if constexpr(ABlockLdsExtraM)
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                make_tuple(AK1Number, Number<KPerBlock + ABlockLdsExtraM>{}, I1));
+        }
+        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
+        // in some cases.
+        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            constexpr auto a_lds_block_desc =
+                make_naive_tensor_descriptor(make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
+                                             make_tuple(AK1Number, Number<KPerBlock>{}, I1));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(make_xor_with_modulo_transform(
+                               make_tuple(Number<MPerBlock>{}, Number<AK0Number>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}),
+                make_tuple(Sequence<1, 0>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_permuted;
+        }
+        else // ColumnMajor A
+        {
+            // kfold and mpair dimension is not always required.
+            // more dimension in merge_transform increase the difficulty of generating immarg offset
+            // for compiler.
+            constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+            constexpr auto M1 = MPerBlock / M0;
+
+            constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+            constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
+
+            constexpr auto kfold = (AK1Number * M0 * sizeof(LDSTypeA) > 128)
+                                       ? 1
+                                       : 128 / (AK1Number * M0 * sizeof(LDSTypeA));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=mpair<=n0
+            constexpr auto mpair = (AK1Number * MPerXdl * sizeof(LDSTypeA) > 128)
+                                       ? 1
+                                       : ((128 / (AK1Number * MPerXdl * sizeof(LDSTypeA))) > M0
+                                              ? M0
+                                              : 128 / (AK1Number * MPerXdl * sizeof(LDSTypeA)));
+
+            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           Number<K0PerThreadWrite>{},
+                           Number<KThreadReadPerm * M1>{},
+                           Number<kfold * M0 / mpair>{},
+                           Number<mpair>{},
+                           AK1Number));
+
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_xor_with_modulo_transform(
+                        make_tuple(Number<KThreadReadPerm * M1>{}, Number<kfold * M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}));
+
+            constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(Number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(Number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(Number<KThreadReadPerm>{}, Number<M1>{})),
+                    make_unmerge_transform(make_tuple(Number<kfold>{}, Number<M0 / mpair>{})),
+                    make_pass_through_transform(Number<mpair>{}),
+                    make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0>{},
+                           Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<3>{},
+                           Sequence<4>{},
+                           Sequence<5>{}),
+                make_tuple(Sequence<1>{},
+                           Sequence<2>{},
+                           Sequence<0, 3>{},
+                           Sequence<4, 5>{},
+                           Sequence<6>{},
+                           Sequence<7>{}));
+
+            constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor(
+                a_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(Number<KThreadReadPerm>{},
+                                          Number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          Number<kfold>{},
+                                          Number<K0PerThreadWrite>{})),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(Number<M0 / mpair>{}, Number<mpair>{}, Number<M1>{})),
+                           make_pass_through_transform(AK1Number)),
+                make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            return a_lds_block_desc_ak0_m_ak1;
+        }
+    }
+
+    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // K0 -> N0/NWave -> NWave -> KLane -> NLane -> KPack
+        return make_naive_tensor_descriptor_packed(
+            make_tuple(Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}));
+    }
+
+    __device__ static constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                           I1,
+                           Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+
+        return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    using BlockwiseGemmPipe =
+        remove_cvref_t<decltype(BlockGemmBPreshufflePipeline_Selector<
+                                BlkGemmPipelineVer,
+                                BlkGemmPipeSched,
+                                BlockSize,
+                                ADataType,
+                                BDataType,
+                                ComputeTypeA,
+                                AccDataType,
+                                decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()),
+                                decltype(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()),
+                                decltype(MakeAMmaTileDescriptor_M0_M1_M2_K(
+                                    GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())),
+                                decltype(MakeBMmaTileDescriptor_N0_N1_N2_K(
+                                    GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())),
+                                ABlockTransferSrcScalarPerVector,
+                                BBlockTransferSrcScalarPerVector,
+                                MPerBlock,
+                                NPerBlock,
+                                KPerBlock,
+                                MPerXdl,
+                                NPerXdl,
+                                MXdlPerWave,
+                                NXdlPerWave,
+                                KPack,
+                                IsInputGemm>())>;
+
+    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
+
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
+
+        return math::max(a_block_space_size_aligned * sizeof(LDSTypeA) / APackedSize,
+                         c_block_size * sizeof(CShuffleDataType));
+    }
+
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    __host__ static constexpr bool CheckValidity(const Argument& karg)
+    {
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
+        {
+            if(!(karg.M % MPerBlock == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
+        {
+            if(!(karg.N % NPerBlock == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+
+            auto K_t = karg.KBatch * KPerBlock;
+            if(!(karg.K % K_t == 0))
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
+                          << karg.K << " " << __FILE__ << ":" << __LINE__
+                          << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+        else
+        {
+            constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
+            auto K_t                = karg.KBatch * KReadVec;
+            auto KReadPadSplited    = math::integer_divide_ceil(karg.K, K_t) * KReadVec;
+            if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K)
+            {
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K (" << karg.K
+                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M (" << karg.M
+                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N (" << karg.N
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg K (" << karg.K
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+        {
+            if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg N (" << karg.N
+                          << ") value is not a multiple of "
+                             "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                          << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " << __FILE__
+                          << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+        else
+        {
+            if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+#if DEBUG_LOG
+                std::cout << "Arg M (" << karg.M
+                          << ") value is not a multiple of "
+                             "CShuffleBlockTransferScalarPerVector_NPerBlock ("
+                          << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " << __FILE__
+                          << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+
+#endif // DEBUG_LOG
+                return false;
+            }
+        }
+
+        // check gridwise gemm pipeline
+#if 1
+        const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value);
+
+        if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
+        {
+            return false;
+        }
+#endif
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
+    }
+
+    __host__ __device__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
+    }
+
+    template <typename CGridDesc>
+    __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock)
+    {
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    // if arch = gfx942
+    // using Block2CTileMapDefault = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock,
+    // NPerBlock>;
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run(const index_t* p_sorted_token_ids,
+                               const index_t* p_sorted_expert_ids,
+                               const index_t* p_max_token_id,
+                               const ADataType* p_a_grid,
+                               const BDataType* p_b_grid,
+                               DsGridPointer& p_ds_grid,
+                               CDataType* p_c_grid,
+                               void* p_shared,
+                               const Problem& problem,
+                               AElementwiseOperation a_element_op,
+                               BElementwiseOperation b_element_op,
+                               CElementwiseOperation c_element_op)
+    {
+        ignore                           = b_element_op;
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
+            problem.MPadded,
+            problem.K,
+            problem.KPadded,
+            problem.StrideA,
+            problem.AK0);
+        const auto b_grid_desc_bpreshuffled =
+            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
+            IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
+            problem.MPadded,
+            problem.N,
+            problem.NPadded,
+            problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
+        // static_assert(NSwizzle == false, "to do fix: need another pr in sorting merged");
+        const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y;
+        if(expert_block_id * MPerBlock >= max_token_id)
+            return;
+        const index_t expert_id =
+            __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[expert_block_id]);
+        const auto block_mn = [&]() -> std::pair<int, int> {
+            if constexpr(NSwizzle)
+            {
+                const index_t ecnt_prefix  = p_max_token_id[1 + expert_id];
+                const index_t prefix_block = ecnt_prefix * problem.NBlock;
+                const index_t ecnt         = p_max_token_id[2 + expert_id] - ecnt_prefix;
+                const index_t expert_swizzle =
+                    ecnt > 0 ? ecnt : 1; // p_max_token_id[expert_id + 1]; // 2
+                const index_t bid_new = blockIdx.x - prefix_block;
+                const index_t nid     = __builtin_amdgcn_readfirstlane(
+                    bid_new % 8 + bid_new / (8 * expert_swizzle) * 8);
+                const index_t mid =
+                    __builtin_amdgcn_readfirstlane(ecnt_prefix + bid_new / 8 % expert_swizzle);
+                return {nid, mid};
+            }
+            else
+            {
+                return {blockIdx.x, blockIdx.y};
+            }
+        }();
+
+        const index_t block_n_id = block_mn.first;
+        const index_t block_m_id = block_mn.second;
+        const index_t token0 =
+            __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff);
+
+        // constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+        constexpr auto AMThreads  = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+        constexpr auto AK0Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+        constexpr auto AK1Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I2);
+        constexpr auto AKThreads  = AK0Threads * AK1Threads;
+        constexpr auto AMRepeats  = MPerBlock / AMThreads;
+        const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats;
+
+        if(token_pos >= max_token_id || token0 >= problem.NumTokens)
+            return;
+        StaticallyIndexedArray<IndexType, AMRepeats> gather_offsets;
+        static_for<0, AMRepeats, 1>{}([&](auto m0) {
+            const index_t fused_token = p_sorted_token_ids[token_pos + m0];
+            index_t token_offset      = fused_token & 0xffffff;
+            if constexpr(!IsInputGemm)
+            {
+                token_offset = token_offset * problem.TopK + (fused_token >> 24);
+            }
+            gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K;
+        });
+        const index_t expert_stride =
+            __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
+
+        // N0, K0, Blocksize*KPack
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid + expert_id * expert_stride / BPackedSize,
+            b_grid_desc_bpreshuffled.GetElementSpaceSize());
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        // dummy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+        // A matrix blockwise copy
+        auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1_gather<
+            ThisThreadBlock,
+            AElementwiseOperation,
+            ck::tensor_operation::element_wise::PassThrough,
+            InMemoryDataOperationEnum::Set,
+            Sequence<AK0Number, MPerBlock, AK1Number>,
+            ABlockTransferThreadClusterLengths_AK0_M_AK1,
+            ABlockTransferThreadClusterArrangeOrder,
+            ADataType,
+            LDSTypeA,
+            decltype(a_grid_desc_ak0_m_ak1),
+            decltype(a_block_desc_ak0_m_ak1),
+            ABlockTransferSrcAccessOrder,
+            Sequence<0, 1, 2>,
+            ABlockTransferSrcVectorDim,
+            2,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector_AK1,
+            1,
+            1,
+            AThreadTransferSrcResetCoordinateAfterRun,
+            true,
+            IndexType,
+            1,
+            BlockwiseGemmPipe::GlobalBufferNum>(a_grid_desc_ak0_m_ak1,
+                                                make_multi_index(0, 0, 0),
+                                                a_element_op,
+                                                a_block_desc_ak0_m_ak1,
+                                                make_multi_index(0, 0, 0),
+                                                ck::tensor_operation::element_wise::PassThrough{},
+                                                gather_offsets);
+
+        // Thread-wise copy
+        // K0 -> N0/NWave -> NWave -> KLane -> NLane -> KPack
+        auto b_block_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        auto b_blockwise_copy = ThreadwiseTensorSliceTransfer_v2<
+            BDataType,
+            BDataType,
+            decltype(b_grid_desc_bpreshuffled),
+            decltype(b_block_desc_bk0_n_bk1),
+            Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+            Sequence<1, 2, 0, 3>,
+            3,
+            BBlockTransferSrcScalarPerVector,
+            BThreadTransferSrcResetCoordinateAfterRun,
+            true>(b_grid_desc_bpreshuffled,
+                  make_multi_index(n_block_data_idx_on_grid,
+                                   get_warp_local_1d_id() % NWave,
+                                   0,
+                                   KPack * (get_thread_local_1d_id() % warpSize)));
+
+        // LDS allocation for A and B: be careful of alignment
+        // Cast after lds
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<LDSTypeA*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, KRepeat, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+        decltype(c_thread_buf) c_thread_buf_up;
+
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  float,
+                                  c_thread_buf.num_of_v_,
+                                  c_thread_buf.s_per_v,
+                                  true>
+            c_thread_buf_fp32;
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+        if constexpr(IsInputGemm)
+        {
+            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2 / BPackedSize;
+            const auto b_grid_buf_up     = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_grid_up + expert_id * expert_stride / BPackedSize,
+                b_grid_desc_bpreshuffled.GetElementSpaceSize());
+            auto b_blockwise_copy_up = ThreadwiseTensorSliceTransfer_v2<
+                BDataType,
+                BDataType,
+                decltype(b_grid_desc_bpreshuffled),
+                decltype(b_block_desc_bk0_n_bk1),
+                Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+                Sequence<1, 2, 0, 3>,
+                3,
+                BBlockTransferSrcScalarPerVector,
+                BThreadTransferSrcResetCoordinateAfterRun,
+                true>(b_grid_desc_bpreshuffled,
+                      make_multi_index(n_block_data_idx_on_grid,
+                                       get_warp_local_1d_id() % NWave,
+                                       0,
+                                       KPack * (get_thread_local_1d_id() % warpSize)));
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_buf,
+                a_block_slice_copy_step,
+                b_grid_desc_bpreshuffled,
+                b_blockwise_copy,
+                b_blockwise_copy_up,
+                b_grid_buf,
+                b_grid_buf_up,
+                b_block_buf,
+                b_block_slice_copy_step,
+                c_thread_buf,
+                c_thread_buf_up,
+                num_k_block_main_loop);
+        }
+        else
+        {
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_buf,
+                a_block_slice_copy_step,
+                b_grid_desc_bpreshuffled,
+                b_blockwise_copy,
+                b_grid_buf,
+                b_block_buf,
+                b_block_slice_copy_step,
+                c_thread_buf,
+                num_k_block_main_loop);
+        }
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            // mul scales
+            const float* p_sorted_weights_0 = p_ds_grid[I0];
+            const float* p_scale_b          = p_ds_grid[I1];
+
+            static_assert(M0 * M1 * M2 * M3 * M4 == MPerBlock);
+            static_assert(M4 == 4);
+            const index_t m1 = get_warp_local_1d_id() / NWave;
+            const index_t m3 = threadIdx.x % get_warp_size() / MPerXdl;
+
+            if(p_sorted_weights_0 != nullptr && p_scale_b != nullptr)
+            {
+                if constexpr(PerTokenQuant)
+                {
+                    constexpr index_t scale_stride = (IsInputGemm ? 2 : 1);
+                    p_scale_b += expert_id * problem.N * scale_stride + block_n_id * NPerBlock +
+                                 get_warp_local_1d_id() % NWave * NPerXdl + threadIdx.x % NPerXdl;
+                }
+                else
+                {
+                    p_scale_b += expert_id;
+                }
+
+                vector_type<int32_t, 4> scale_token_ids;
+                vector_type<float, 4> topk_weights;
+                static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+                    const float scale_b = p_scale_b[n0 * NWave * NPerXdl * PerTokenQuant];
+                    static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+                        static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
+                            const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 +
+                                                  m1 * M2 * M3 * M4 + m2 * M3 * M4 + m3 * M4;
+                            if constexpr(PerTokenQuant)
+                            {
+                                scale_token_ids =
+                                    *c_style_pointer_cast<const vector_type<int32_t, M4>*>(
+                                        p_sorted_token_ids + m_pos);
+                            }
+                            if constexpr(MulRoutedWeight)
+                            {
+                                topk_weights = *c_style_pointer_cast<const vector_type<float, M4>*>(
+                                    p_ds_grid[I2] + m_pos);
+                            }
+                            static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size
+                                float scale_a = [&]() {
+                                    if constexpr(PerTokenQuant)
+                                    {
+                                        index_t fused_token = scale_token_ids.AsType<index_t>()[m4];
+                                        const index_t token_offset = fused_token & 0xffffff;
+                                        return token_offset < problem.NumTokens
+                                                   ? p_sorted_weights_0[token_offset]
+                                                   : 0.0;
+                                    }
+                                    else
+                                    {
+                                        return p_sorted_weights_0[0];
+                                    }
+                                }();
+                                constexpr index_t c_offset =
+                                    blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                        make_tuple(m0, n0, m2 * M4 + m4));
+                                constexpr auto cidx = Number<c_offset>{};
+                                if constexpr(IsInputGemm) // gu fusion
+                                {
+                                    if constexpr(ActivationOperation == Activation::silu_and_mul)
+                                    {
+                                        const float scale_up =
+                                            p_scale_b[(n0 * NWave * NPerXdl + problem.N) *
+                                                      PerTokenQuant];
+                                        float gate = scale_a * scale_b * c_thread_buf[cidx];
+                                        float up   = scale_a * scale_up * c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                        {
+                                            gate *= 16;
+                                            up *= 16;
+                                        }
+                                        tensor_operation::element_wise::Silu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                    else if(ActivationOperation == Activation::gelu_and_mul)
+                                    {
+                                        const float scale_up =
+                                            p_scale_b[(n0 * NWave * NPerXdl + problem.N) *
+                                                      PerTokenQuant];
+                                        float gate = scale_a * scale_b * c_thread_buf[cidx];
+                                        float up   = scale_a * scale_up * c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                        {
+                                            gate *= 16;
+                                            up *= 16;
+                                        }
+                                        tensor_operation::element_wise::Gelu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                }
+                                else
+                                {
+                                    c_thread_buf_fp32(cidx) =
+                                        scale_a * scale_b * c_thread_buf[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        c_thread_buf_fp32(cidx) = c_thread_buf_fp32(cidx) *
+                                                                  topk_weights.AsType<float>()[m4];
+                                    }
+                                }
+                            });
+                        });
+                    });
+                });
+            }
+            else
+            {
+                vector_type<float, 4> topk_weights; // for gemm2 only
+                static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+                    static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+                        static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
+                            const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 +
+                                                  m1 * M2 * M3 * M4 + m2 * M3 * M4 + m3 * M4;
+                            if constexpr(MulRoutedWeight)
+                            {
+                                topk_weights = *c_style_pointer_cast<const vector_type<float, M4>*>(
+                                    p_ds_grid[I2] + m_pos);
+                            }
+                            static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size
+                                constexpr index_t c_offset =
+                                    blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                        make_tuple(m0, n0, m2 * M4 + m4));
+                                constexpr auto cidx = Number<c_offset>{};
+
+                                if constexpr(IsInputGemm) // gu fusion
+                                {
+                                    if constexpr(ActivationOperation == Activation::silu_and_mul)
+                                    {
+                                        float gate = c_thread_buf[cidx];
+                                        float up   = c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        tensor_operation::element_wise::Silu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                    else if(ActivationOperation == Activation::gelu_and_mul)
+                                    {
+                                        float gate = c_thread_buf[cidx];
+                                        float up   = c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        tensor_operation::element_wise::Gelu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                }
+                                else
+                                {
+                                    c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        c_thread_buf_fp32(cidx) = topk_weights.AsType<float>()[m4] *
+                                                                  c_thread_buf_fp32[cidx];
+                                    }
+                                }
+                            });
+                        });
+                    });
+                });
+            }
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            using EDataType = CDataType;
+
+            const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
+                problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
+
+            const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+                MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+            const auto ds_grid_buf = generate_tuple(
+                [&](auto i) {
+                    return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                        p_ds_grid[i], ds_grid_desc_m_n[i].GetElementSpaceSize());
+                },
+                Number<NumDTensor>{});
+
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_desc_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                generate_tie(
+                    [&](auto i) -> const auto& // return type should be reference
+                    { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
+                    Number<NumDTensor>{}));
+
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_buf_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_buf),
+                generate_tie(
+                    [&](auto i) -> const auto& // return type should be reference
+                    { return ds_grid_buf[i]; },
+                    Number<NumDTensor>{}));
+
+            // tuple of starting index of C/Ds blockwise copy
+            const auto idx_c_ds_block_begin =
+                container_concat(make_tuple(make_multi_index(0, 0, 0, 0)),
+                                 generate_tuple(
+                                     [&](auto) {
+                                         return make_multi_index(block_m_id, 0, block_n_id, 0);
+                                         // return make_multi_index(block_work_idx[I0], 0,
+                                         // block_work_idx[I1], 0);
+                                     },
+                                     Number<NumDTensor>{}));
+
+            const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                c_grid_desc_mblock_mperblock_nblock_nperblock;
+
+            using CDEBlockTransferCluster =
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+            const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation;
+            constexpr index_t scatter_weight_idx  = 3; // hack fix felix
+            auto cde_block_copy_lds_and_global    = ThreadGroupTensorSliceTransfer_v7r3_scatter<
+                ThisThreadBlock,
+                decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
+                Tuple<EDataType>,
+                decltype(c_ds_desc_refs),
+                decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
+                CElementwiseOperation,
+                Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make Sequence
+                                                                            // support arbitray type
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CDEBlockTransferCluster,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                Sequence<0, 1, 2, 3>, // typename SrcDimAccessOrder,
+                Sequence<0, 1, 2, 3>, // typename DstDimAccessOrder,
+                3,                    // index_t SrcVectorDim,
+                3,                    // index_t DstVectorDim,
+                CDEShuffleBlockTransferScalarPerVectors,
+                CShuffleBlockTransferScalarPerVector_NPerBlock,
+                sequence_merge_t<
+                    Sequence<true>,
+                    uniform_sequence_gen_t<NumDTensor,
+                                           false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
+                Sequence<false>, // ThreadTransferDstResetCoordinateAfterRunFlags
+                IndexType,
+                1,                 // ScatterDim
+                true,              // OutputScatter: false, only use scatter weights
+                scatter_weight_idx // ScatterWeightIdx: ascale
+                >{c_ds_desc_refs,
+                  idx_c_ds_block_begin,
+                  tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                  make_tuple(make_multi_index(0, 0, block_n_id, 0)),
+                  c_element_op};
+
+            auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            // space filling curve for shuffled blockwise C/D/E
+            constexpr auto sfc_cde_block =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!");
+            constexpr auto EMThreads =
+                CDEBlockTransferCluster{}.At(I0) * CDEBlockTransferCluster{}.At(I1);
+            constexpr auto EMRepeats = CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl / EMThreads;
+            constexpr auto ENThreads =
+                CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3);
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                StaticallyIndexedArray<IndexType, EMRepeats> scatter_offsets;
+
+                auto dstidx = sfc_cde_block.GetIndex(access_id);
+                const index_t c_token_pos =
+                    block_m_id * MPerBlock + threadIdx.x / ENThreads * EMRepeats + dstidx(I1);
+                static_for<0, EMRepeats, 1>{}([&](auto m0) {
+                    const index_t fused_token = p_sorted_token_ids[c_token_pos + m0];
+                    IndexType token_offset    = fused_token & 0xffffff;
+                    if constexpr(IsInputGemm)
+                    {
+                        token_offset = token_offset * problem.TopK + (fused_token >> 24);
+                    }
+                    scatter_offsets(m0) = static_cast<IndexType>(token_offset) * problem.N;
+                });
+
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf_fp32,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                cde_block_copy_lds_and_global.Run(
+                    c_ds_desc_refs,
+                    c_ds_buf_refs,
+                    tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                    tie(c_grid_buf),
+                    scatter_offsets);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto cde_lds_and_global_step =
+                        sfc_cde_block.GetForwardStep(access_id);
+
+                    // move on Ds
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        cde_block_copy_lds_and_global.MoveSrcSliceWindow(
+                            c_ds_desc_refs, i + I1, cde_lds_and_global_step);
+                    });
+
+                    // move on E
+                    cde_block_copy_lds_and_global.MoveDstSliceWindow(
+                        tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                        I0,
+                        cde_lds_and_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              TailNumber TailNum = TailNumber::Odd>
+    __device__ static void Run_2Lds(const index_t* p_sorted_token_ids,
+                                    const index_t* p_sorted_expert_ids,
+                                    const index_t* p_max_token_id,
+                                    const ADataType* p_a_grid,
+                                    const BDataType* p_b_grid,
+                                    DsGridPointer& p_ds_grid,
+                                    CDataType* p_c_grid,
+                                    void* p_shared,
+                                    void* p_shared1,
+                                    const Problem& problem,
+                                    AElementwiseOperation a_element_op,
+                                    BElementwiseOperation b_element_op,
+                                    CElementwiseOperation c_element_op)
+    {
+        ignore                           = b_element_op;
+        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
+            IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
+            problem.MPadded,
+            problem.K,
+            problem.KPadded,
+            problem.StrideA,
+            problem.AK0);
+        const auto b_grid_desc_bpreshuffled =
+            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
+        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
+            IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
+            problem.MPadded,
+            problem.N,
+            problem.NPadded,
+            problem.StrideC);
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+        const index_t max_token_id    = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
+        const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y;
+        if(expert_block_id * MPerBlock >= max_token_id)
+            return;
+        const index_t expert_id =
+            __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[expert_block_id]);
+        const auto block_mn = [&]() -> std::pair<int, int> {
+            if constexpr(NSwizzle)
+            {
+                const index_t ecnt_prefix    = p_max_token_id[1 + expert_id];
+                const index_t prefix_block   = ecnt_prefix * problem.NBlock;
+                const index_t ecnt           = p_max_token_id[2 + expert_id] - ecnt_prefix;
+                const index_t expert_swizzle = ecnt > 0 ? ecnt : 1;
+                const index_t bid_new        = blockIdx.x - prefix_block;
+                const index_t nid            = __builtin_amdgcn_readfirstlane(
+                    bid_new % 8 + bid_new / (8 * expert_swizzle) * 8);
+                const index_t mid =
+                    __builtin_amdgcn_readfirstlane(ecnt_prefix + bid_new / 8 % expert_swizzle);
+                return {nid, mid};
+            }
+            else
+            {
+                return {blockIdx.x, blockIdx.y};
+            }
+        }();
+        const index_t block_n_id = block_mn.first;
+        const index_t block_m_id = block_mn.second;
+
+        const index_t token0 =
+            __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff);
+
+        // constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+        constexpr auto AMThreads  = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
+        constexpr auto AK0Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
+        constexpr auto AK1Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I2);
+        constexpr auto AKThreads  = AK0Threads * AK1Threads;
+        constexpr auto AMRepeats  = MPerBlock / AMThreads;
+        const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats;
+
+        if(token_pos >= max_token_id || expert_block_id * MPerBlock >= max_token_id ||
+           token0 >= problem.NumTokens)
+            return;
+        StaticallyIndexedArray<IndexType, AMRepeats>
+            gather_offsets; //= p_sorted_token_ids[token_pos];
+        static_for<0, AMRepeats, 1>{}([&](auto m0) {
+            const index_t fused_token = p_sorted_token_ids[token_pos + m0];
+            index_t token_offset      = fused_token & 0xffffff;
+            if constexpr(!IsInputGemm)
+            {
+                token_offset = token_offset * problem.TopK + (fused_token >> 24);
+            }
+            gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K;
+        });
+        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K);
+
+        // N0, K0, Blocksize*KPack
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid + expert_id * expert_stride / BPackedSize,
+            b_grid_desc_bpreshuffled.GetElementSpaceSize());
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        // dummy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+        // A matrix blockwise copy
+        auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1_gather<
+            ThisThreadBlock,
+            AElementwiseOperation,
+            ck::tensor_operation::element_wise::PassThrough,
+            InMemoryDataOperationEnum::Set,
+            Sequence<AK0Number, MPerBlock, AK1Number>,
+            ABlockTransferThreadClusterLengths_AK0_M_AK1,
+            ABlockTransferThreadClusterArrangeOrder,
+            ADataType,
+            LDSTypeA,
+            decltype(a_grid_desc_ak0_m_ak1),
+            decltype(a_block_desc_ak0_m_ak1),
+            ABlockTransferSrcAccessOrder,
+            Sequence<0, 1, 2>,
+            ABlockTransferSrcVectorDim,
+            2,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector_AK1,
+            1,
+            1,
+            AThreadTransferSrcResetCoordinateAfterRun,
+            true,
+            IndexType,
+            1,
+            2>(a_grid_desc_ak0_m_ak1,
+               make_multi_index(0, 0, 0),
+               a_element_op,
+               a_block_desc_ak0_m_ak1,
+               make_multi_index(0, 0, 0),
+               ck::tensor_operation::element_wise::PassThrough{},
+               gather_offsets);
+
+        // Thread-wise copy
+        // K0 -> N0/NWave -> NWave -> KLane -> NLane -> KPack
+        auto b_block_buf_ping = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto b_block_buf_pong = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
+
+        auto b_blockwise_copy = ThreadwiseTensorSliceTransfer_v2<
+            BDataType,
+            BDataType,
+            decltype(b_grid_desc_bpreshuffled),
+            decltype(b_block_desc_bk0_n_bk1),
+            Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+            Sequence<1, 2, 0, 3>,
+            3,
+            BBlockTransferSrcScalarPerVector,
+            BThreadTransferSrcResetCoordinateAfterRun,
+            true>(b_grid_desc_bpreshuffled,
+                  make_multi_index(n_block_data_idx_on_grid,
+                                   get_warp_local_1d_id() % NWave,
+                                   0,
+                                   KPack * (get_thread_local_1d_id() % warpSize)));
+
+        // LDS allocation for A and B: be careful of alignment
+        // Cast after lds
+        auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+        auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ADataType*>(p_shared1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+        auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, KRepeat, 0);
+
+        // Blockwise GEMM pipeline
+        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
+        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
+        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+
+        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
+                                                                         a_block_desc_ak0_m_ak1,
+                                                                         a_blockwise_copy,
+                                                                         a_grid_buf,
+                                                                         a_block_bufs,
+                                                                         a_block_slice_copy_step,
+                                                                         b_grid_desc_bpreshuffled,
+                                                                         b_blockwise_copy,
+                                                                         b_grid_buf,
+                                                                         b_block_bufs,
+                                                                         b_block_slice_copy_step,
+                                                                         c_thread_buf,
+                                                                         num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            using EDataType = CDataType;
+
+            const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
+                problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
+
+            const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+                MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
+
+            const auto ds_grid_buf = generate_tuple(
+                [&](auto i) {
+                    using DDataType       = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                    const DDataType* ptr_ = p_ds_grid[i];
+                    // hack logic here to support different kind of strides. todo fix it.
+                    // ascale t, 1; bscale E, N, 1, move ptr to E
+                    // if(i.value == 1)
+                    // {
+                    //     ptr_ +=
+                    //         expert_id * (problem.StrideDs[1] ? problem.StrideDs[1] * problem.N :
+                    //         1);
+                    // }
+                    return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                        ptr_, ds_grid_desc_m_n[i].GetElementSpaceSize());
+                },
+                Number<NumDTensor>{});
+
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_desc_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                generate_tie(
+                    [&](auto i) -> const auto& // return type should be reference
+                    { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
+                    Number<NumDTensor>{}));
+
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_buf_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_buf),
+                generate_tie(
+                    [&](auto i) -> const auto& // return type should be reference
+                    { return ds_grid_buf[i]; },
+                    Number<NumDTensor>{}));
+
+            // tuple of starting index of C/Ds blockwise copy
+            const auto idx_c_ds_block_begin =
+                container_concat(make_tuple(make_multi_index(0, 0, 0, 0)),
+                                 generate_tuple(
+                                     [&](auto) {
+                                         return make_multi_index(block_m_id, 0, block_n_id, 0);
+                                         // return make_multi_index(block_work_idx[I0], 0,
+                                         // block_work_idx[I1], 0);
+                                     },
+                                     Number<NumDTensor>{}));
+
+            const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+                c_grid_desc_mblock_mperblock_nblock_nperblock;
+
+            using CDEBlockTransferCluster =
+                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+            const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation;
+            constexpr index_t scatter_weight_idx  = 3; // hack fix felix
+            auto cde_block_copy_lds_and_global    = ThreadGroupTensorSliceTransfer_v7r3_scatter<
+                ThisThreadBlock,
+                decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
+                Tuple<EDataType>,
+                decltype(c_ds_desc_refs),
+                decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
+                CElementwiseOperation,
+                Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make Sequence
+                                                                            // support arbitray type
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CDEBlockTransferCluster,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                Sequence<0, 1, 2, 3>, // typename SrcDimAccessOrder,
+                Sequence<0, 1, 2, 3>, // typename DstDimAccessOrder,
+                3,                    // index_t SrcVectorDim,
+                3,                    // index_t DstVectorDim,
+                CDEShuffleBlockTransferScalarPerVectors,
+                CShuffleBlockTransferScalarPerVector_NPerBlock,
+                sequence_merge_t<
+                    Sequence<true>,
+                    uniform_sequence_gen_t<NumDTensor,
+                                           false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
+                Sequence<false>, // ThreadTransferDstResetCoordinateAfterRunFlags
+                IndexType,
+                1,                 // ScatterDim
+                true,              // OutputScatter: false, only use scatter weights
+                scatter_weight_idx // ScatterWeightIdx: ascale
+                >{c_ds_desc_refs,
+                  idx_c_ds_block_begin,
+                  tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                  make_tuple(make_multi_index(0, 0, block_n_id, 0)),
+                  c_element_op};
+
+            auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            // space filling curve for shuffled blockwise C/D/E
+            constexpr auto sfc_cde_block =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!");
+            constexpr auto EMThreads =
+                CDEBlockTransferCluster{}.At(I0) * CDEBlockTransferCluster{}.At(I1);
+            constexpr auto EMRepeats = CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl / EMThreads;
+            constexpr auto ENThreads =
+                CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3);
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                StaticallyIndexedArray<IndexType, EMRepeats> scatter_offsets;
+
+                auto dstidx = sfc_cde_block.GetIndex(access_id);
+                const index_t c_token_pos =
+                    block_m_id * MPerBlock + threadIdx.x / ENThreads * EMRepeats + dstidx(I1);
+                static_for<0, EMRepeats, 1>{}([&](auto m0) {
+                    const index_t fused_token = p_sorted_token_ids[c_token_pos + m0];
+                    index_t token_offset      = fused_token & 0xffffff;
+                    if constexpr(IsInputGemm)
+                    {
+                        token_offset = token_offset * problem.TopK + (fused_token >> 24);
+                    }
+                    scatter_offsets(m0) = static_cast<IndexType>(token_offset) * problem.N;
+                });
+
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                cde_block_copy_lds_and_global.Run(
+                    c_ds_desc_refs,
+                    c_ds_buf_refs,
+                    tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                    tie(c_grid_buf),
+                    scatter_offsets);
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto cde_lds_and_global_step =
+                        sfc_cde_block.GetForwardStep(access_id);
+
+                    // move on Ds
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        cde_block_copy_lds_and_global.MoveSrcSliceWindow(
+                            c_ds_desc_refs, i + I1, cde_lds_and_global_step);
+                    });
+
+                    // move on E
+                    cde_block_copy_lds_and_global.MoveDstSliceWindow(
+                        tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                        I0,
+                        cde_lds_and_global_step);
+                }
+            });
+        }
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm_builtins.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm_builtins.hpp
new file mode 100644
index 0000000000..7c3e372765
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm_builtins.hpp
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
+
+namespace ck {
+
+template <typename GridwiseSparseEmbedding,
+          typename EmbType,
+          typename IndexType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename OutType,
+          typename OutGridDesc,
+          typename EmbElementwiseOperation,
+          ck::index_t NumEmbeddings>
+#if CK_USE_LAUNCH_BOUNDS
+__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+    __global__ void kernel_sparse_embeddings_forward_layernorm(
+        OutType* p_out,
+        const ck::Array<EmbType*, NumEmbeddings> p_embs,
+        const ck::Array<IndexType*, NumEmbeddings> p_indexes,
+        const GammaDataType* p_gamma,
+        const BetaDataType* p_beta,
+        const OutGridDesc out_grid_desc,
+        const AccDataType epsilon,
+        const EmbElementwiseOperation emb_elementwise_op)
+{
+    GridwiseSparseEmbedding::Run(
+        p_out, p_embs, p_indexes, p_gamma, p_beta, out_grid_desc, epsilon, emb_elementwise_op);
+}
+
+template <typename EmbType,
+          typename IndexType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename OutType,
+          typename OutGridDesc,
+          typename EmbElementwiseOperation,
+          ck::index_t BlockSize,
+          ck::index_t DimClusterSize,
+          ck::index_t RowClusterSize,
+          ck::index_t DimPerBlock,   // Row x Dim, along Dim
+          ck::index_t RowPerBlock,   // Row x Dim, along Row
+          ck::index_t DimThreadSize, // this is actually not vector, but number of registers
+          ck::index_t RowVectorSize,
+          ck::index_t NumEmbeddings>
+struct GridwiseSparseEmbeddingsForwardLayernorm
+{
+    static constexpr auto I0          = Number<0>{};
+    static constexpr auto I1          = Number<1>{};
+    static constexpr auto I2          = Number<2>{};
+    static constexpr auto I3          = Number<3>{};
+    static constexpr index_t WaveSize = 64;
+
+    static_assert(BlockSize == RowClusterSize * DimClusterSize,
+                  "Invalid cluster distribution within block");
+    static_assert(RowClusterSize % WaveSize == 0, "need to be wavewise");
+
+    static_assert(DimPerBlock % (DimClusterSize * DimThreadSize) == 0, "");
+    static_assert(RowPerBlock % (RowClusterSize * RowVectorSize) == 0, "");
+
+    static constexpr auto DimSubBlocks = DimPerBlock / (DimClusterSize * DimThreadSize);
+    static constexpr auto RowSubBlocks = RowPerBlock / (RowClusterSize * RowVectorSize);
+
+    static_assert((DimPerBlock % DimSubBlocks == 0) && (RowPerBlock % RowSubBlocks == 0), "");
+    static constexpr auto DimPerSubBlock = DimPerBlock / DimSubBlocks;
+    static constexpr auto RowPerSubBlock = RowPerBlock / RowSubBlocks;
+
+    using ThreadwiseWolfordDesc2D = decltype(make_naive_tensor_descriptor_packed(make_tuple(
+        Number<DimSubBlocks * DimThreadSize>{}, Number<RowSubBlocks * RowVectorSize>{})));
+
+    using ThreadwiseWolfordDescReduce = decltype(make_naive_tensor_descriptor_packed(
+        make_tuple(Number<DimSubBlocks * DimThreadSize>{})));
+
+    using ThreadwiseWelford =
+        ThreadwiseWelford<AccDataType, ThreadwiseWolfordDesc2D, ThreadwiseWolfordDescReduce>;
+
+    using ThreadClusterLength = Sequence<DimClusterSize, RowClusterSize>;
+
+    using BlockwiseWelford =
+        BlockwiseWelford<AccDataType, BlockSize, ThreadClusterLength, Sequence<0, 1>>;
+
+    __device__ static void Run(OutType* p_out,
+                               const ck::Array<EmbType*, NumEmbeddings> p_embs,
+                               const ck::Array<IndexType*, NumEmbeddings> p_indexes,
+                               const GammaDataType* p_gamma,
+                               const BetaDataType* p_beta,
+                               const OutGridDesc,
+                               const AccDataType epsilon,
+                               const EmbElementwiseOperation emb_elementwise_op)
+    {
+        const index_t thread_local_id = get_thread_local_1d_id();
+        const index_t block_global_id = get_block_1d_id();
+
+        constexpr auto thread_cluster_desc =
+            make_cluster_descriptor(Sequence<DimClusterSize, RowClusterSize>{}, Sequence<0, 1>{});
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_dim_cluster_id = thread_cluster_idx[I0];
+        const auto thread_row_cluster_id = thread_cluster_idx[I1];
+
+        const auto wave_dim_id = __builtin_amdgcn_readfirstlane(thread_dim_cluster_id / WaveSize);
+
+        const auto index_start = block_global_id * DimPerBlock + wave_dim_id * DimThreadSize;
+
+        auto threadwise_welford       = ThreadwiseWelford();
+        threadwise_welford.max_count_ = RowSubBlocks * RowVectorSize;
+
+        constexpr auto thread_buf_size =
+            DimSubBlocks * DimThreadSize * RowSubBlocks * RowVectorSize;
+        constexpr auto thread_buf_desc = make_naive_tensor_descriptor_packed(
+            make_tuple(DimSubBlocks, DimThreadSize, RowSubBlocks, RowVectorSize));
+        constexpr auto mean_var_buf_size = DimSubBlocks * DimThreadSize;
+        constexpr auto mean_var_buf_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(DimSubBlocks, DimThreadSize));
+        constexpr auto gamma_beta_buf_size = RowSubBlocks * RowVectorSize;
+        constexpr auto gamma_beta_buf_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(RowSubBlocks, RowVectorSize));
+
+        ck::Array<StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, thread_buf_size, true>,
+                  NumEmbeddings>
+            in_thread_bufs;
+        ck::Array<StaticBuffer<AddressSpaceEnum::Vgpr, IndexType, DimPerBlock, true>, NumEmbeddings>
+            index_bufs;
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, thread_buf_size, true> acc_thread_buf;
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, gamma_beta_buf_size, true>
+            gamma_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, gamma_beta_buf_size, true>
+            beta_thread_buf;
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, mean_var_buf_size, true> mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, mean_var_buf_size, true> var_thread_buf;
+
+        auto load_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
+            ck::Array<vector_type_maker_t<EmbType, RowVectorSize>, NumEmbeddings> emb_vectors;
+            auto emb_a         = emb_vectors[0];
+            using src_vector_t = typename decltype(emb_a)::type;
+            static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
+                constexpr auto current_dim = i_dim_sub_ * DimPerSubBlock + i_dim_vec_;
+
+                auto thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
+                                     sizeof(EmbType) * RowVectorSize;
+                static_for<0, NumEmbeddings, 1>{}([&](auto i_embedding_) {
+                    IndexType index = index_bufs[i_embedding_][Number<current_dim>{}];
+
+                    __amdgpu_buffer_rsrc_t emb_res =
+                        make_wave_buffer_resource_with_default_range_new(p_embs[i_embedding_] +
+                                                                         index * RowPerBlock);
+                    emb_vectors(i_embedding_).template AsType<src_vector_t>()(I0) =
+                        amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res, thread_offset, 0);
+                });
+
+                static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
+                    constexpr auto register_offset = thread_buf_desc.CalculateOffset(
+                        make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
+                    static_for<0, NumEmbeddings, 1>{}([&](auto i_embedding_) {
+                        in_thread_bufs(i_embedding_)(Number<register_offset>{}) =
+                            ck::type_convert<AccDataType>(
+                                emb_vectors[i_embedding_].template AsType<EmbType>()[i_row_vec_]);
+                    });
+                });
+            });
+        };
+
+        auto accumulate_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
+            static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
+                static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
+                    constexpr auto register_offset = thread_buf_desc.CalculateOffset(
+                        make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
+                    auto in_data_refs = generate_tie(
+                        [&](auto i_embedding_) -> const auto& {
+                            return in_thread_bufs(i_embedding_)(Number<register_offset>{});
+                        },
+                        Number<NumEmbeddings>{});
+                    auto out_data_refs = generate_tie(
+                        [&](auto) -> auto& { return acc_thread_buf(Number<register_offset>{}); },
+                        Number<1>{});
+                    unpack2(emb_elementwise_op, out_data_refs, in_data_refs);
+                });
+            });
+        };
+
+        auto threadwise_welford_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
+            static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
+                static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
+                    constexpr auto register_offset = thread_buf_desc.CalculateOffset(
+                        make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
+                    constexpr auto mean_var_offset =
+                        mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_));
+
+                    threadwise_welford.cur_count_++;
+                    threadwise_welford.Update(mean_thread_buf(Number<mean_var_offset>{}),
+                                              var_thread_buf(Number<mean_var_offset>{}),
+                                              acc_thread_buf(Number<register_offset>{}));
+                });
+            });
+        };
+
+        auto threadwise_normalize_store_out = [&](auto i_dim_sub_, auto i_row_sub_) {
+            __amdgpu_buffer_rsrc_t out_res =
+                make_wave_buffer_resource_with_default_range_new(p_out + index_start * RowPerBlock);
+            static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
+                vector_type_maker_t<OutType, RowVectorSize> out_vector;
+                using dst_vector_t = typename decltype(out_vector)::type;
+
+                constexpr auto mean_var_offset =
+                    mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_));
+                auto divisor =
+                    1 / __builtin_amdgcn_sqrtf(var_thread_buf(Number<mean_var_offset>{}) + epsilon);
+                static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
+                    constexpr auto register_offset = thread_buf_desc.CalculateOffset(
+                        make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
+                    constexpr auto gamma_beta_offset =
+                        gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_));
+
+                    auto acc_val = acc_thread_buf[Number<register_offset>{}];
+                    acc_val      = (acc_val - mean_thread_buf(Number<mean_var_offset>{})) * divisor;
+                    acc_val      = acc_val * gamma_thread_buf[Number<gamma_beta_offset>{}] +
+                              beta_thread_buf[Number<gamma_beta_offset>{}];
+
+                    out_vector.template AsType<OutType>()(Number<i_row_vec_>{}) =
+                        type_convert<OutType>(acc_val);
+                });
+
+                index_t thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
+                                        sizeof(OutType) * RowVectorSize;
+
+                amd_buffer_store_impl<OutType, RowVectorSize>(
+                    out_vector.template AsType<dst_vector_t>()[Number<0>{}],
+                    out_res,
+                    thread_offset,
+                    0);
+            });
+        };
+
+        // first load index
+        ck::static_for<0, DimPerBlock, 1>{}([&](auto i_idx_) {
+            // prefer use s_load
+            ck::static_for<0, NumEmbeddings, 1>{}([&](auto i_embedding_) {
+                index_bufs(i_embedding_)(i_idx_) =
+                    p_indexes[i_embedding_][index_start + i_idx_.value];
+            });
+        });
+
+        // load gamma/beta
+        static_for<0, RowSubBlocks, 1>{}([&](auto i_row_sub_) {
+            vector_type_maker_t<GammaDataType, RowVectorSize> gamma_vector;
+            vector_type_maker_t<BetaDataType, RowVectorSize> beta_vector;
+
+            index_t thread_offset_gamma = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
+                                          sizeof(GammaDataType) * RowVectorSize;
+            index_t thread_offset_beta = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
+                                         sizeof(BetaDataType) * RowVectorSize;
+
+            __amdgpu_buffer_rsrc_t gamma_res =
+                make_wave_buffer_resource_with_default_range_new(p_gamma);
+            __amdgpu_buffer_rsrc_t beta_res =
+                make_wave_buffer_resource_with_default_range_new(p_beta);
+
+            gamma_vector.template AsType<typename decltype(gamma_vector)::type>()(I0) =
+                amd_buffer_load_impl<GammaDataType, RowVectorSize>(
+                    gamma_res, thread_offset_gamma, 0);
+            beta_vector.template AsType<typename decltype(beta_vector)::type>()(I0) =
+                amd_buffer_load_impl<BetaDataType, RowVectorSize>(beta_res, thread_offset_beta, 0);
+
+            static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
+                constexpr auto offset =
+                    gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_));
+                gamma_thread_buf(Number<offset>{}) = type_convert<AccDataType>(
+                    gamma_vector.template AsType<GammaDataType>()[Number<i_row_vec_>{}]);
+                beta_thread_buf(Number<offset>{}) = type_convert<AccDataType>(
+                    beta_vector.template AsType<BetaDataType>()[Number<i_row_vec_>{}]);
+            });
+        });
+
+        static_for<0, thread_buf_size, 1>{}(
+            [&](auto I) { acc_thread_buf(I) = type_convert<AccDataType>(0.0f); });
+
+        static_for<0, mean_var_buf_size, 1>{}([&](auto I) {
+            mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
+            var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
+        });
+
+        static_for<0, DimSubBlocks, 1>{}([&](auto i_dim_sub) {
+            load_current_sub_row(i_dim_sub, Number<0>{});
+            static_for<0, RowSubBlocks - 1, 1>{}([&](auto i_row) {
+                load_current_sub_row(i_dim_sub, Number<1>{} + i_row);
+                accumulate_current_sub_row(i_dim_sub, i_row);
+                threadwise_welford_sub_row(i_dim_sub, i_row);
+            });
+            accumulate_current_sub_row(i_dim_sub, Number<RowSubBlocks - 1>{});
+            threadwise_welford_sub_row(i_dim_sub, Number<RowSubBlocks - 1>{});
+
+            // blockwise welford
+            static_for<0, mean_var_buf_size, 1>{}([&](auto I) {
+                if constexpr(I > 0)
+                    block_sync_lds();
+                BlockwiseWelford::Run(
+                    mean_thread_buf(I), var_thread_buf(I), threadwise_welford.cur_count_);
+            });
+
+            // store
+            static_for<0, RowSubBlocks, 1>{}(
+                [&](auto i_row) { threadwise_normalize_store_out(i_dim_sub, i_row); });
+        });
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 21315c2567..2255505985 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -189,15 +189,35 @@ struct ThreadwiseTensorSliceTransfer_v1r3
     const ElementwiseOperation element_op_;
 }; // namespace ThreadwiseTensorSliceTransfer_v1r3
 
-// Assume:
-//   1. src:
-//     1. SrcDesc is not known at compile-time
-//     2. SrcBuffer is DynamicBuffer
-//     3. src_slice_origin_idx is not known at compile-time
-//   2. dst:
-//     1. DstDesc is known at compile-time
-//     2. DstBuffer is StaticBuffer
-//     3. dst_slice_origin_idx is known at compile-time
+/**
+ * @brief Helper structure that facilitates transfer of source (grid) data to destination threads.
+ *
+ * @details The following assumptions are made:
+ *  - For Source (Grid) Data:
+ *     1. The source tensor descriptor SrcDesc is not known at compile-time.
+ *     2. The source buffer is a dynamic buffer.
+ *     3. The source slice origin index src_slice_origin_idx is not known at compile-time.
+ *  - For Destination (Thread) Data:
+ *     1. The destination tensor descriptor DstDesc is known at compile-time.
+ *     2. The destination buffer dst_buf is a static buffer.
+ *     3. The destination slice origin index dst_slice_origin_idx is known at compile-time.
+ *
+ * @tparam SrcData The data type of the source tensor.
+ * @tparam DstData The data type of the destination tensor.
+ * @tparam SrcDesc The descriptor type of the source tensor.
+ * @tparam DstDesc The descriptor type of the destination tensor.
+ * @tparam SliceLengths The lengths of the slice to be transferred.
+ * @tparam DimAccessOrder The order of dimension access for the space-filling curve.
+ * @tparam SrcVectorDim The dimension along which vectorized access is performed in the source
+ * tensor.
+ * @tparam SrcScalarPerVector The number of scalar elements per vector in the source tensor.
+ * @tparam SrcScalarStrideInVector Not used.
+ * @tparam SrcResetCoordinateAfterRun controls whether source coordinate is restored after each Run
+ * or rolled back one step in MoveSrcSliceWindow
+ * @tparam InvalidElementAsNaN Whether to fill invalid elements with NaN (only applicable for
+ * floating-point types).
+ *
+ */
 template <typename SrcData,
           typename DstData,
           typename SrcDesc,
@@ -224,6 +244,13 @@ struct ThreadwiseTensorSliceTransfer_v2
 
     using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
 
+    static constexpr index_t PackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc,
                                                           const Index& src_slice_origin_idx)
         : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx))
@@ -232,6 +259,11 @@ struct ThreadwiseTensorSliceTransfer_v2
                       "wrong! SrcDesc need to known at compile-time");
         static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
                       "wrong! Not divisible");
+
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+        {
+            static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1");
+        }
     }
 
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -276,10 +308,10 @@ struct ThreadwiseTensorSliceTransfer_v2
         constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
 
         static_for<0, num_access, 1>{}([&](auto idx_1d) {
-            typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
+            typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type src_vector;
 
             using src_vector_t =
-                typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+                typename vector_type_maker<SrcData, SrcScalarPerVector / PackedSize>::type::type;
             constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);
 
             const bool is_src_valid =
@@ -287,10 +319,11 @@ struct ThreadwiseTensorSliceTransfer_v2
 
             // copy data from src_buf into src_vector
             src_vector.template AsType<src_vector_t>()(Number<0>{}) =
-                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
+                src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize,
+                                                   is_src_valid);
 
             // copy data from src_vector into dst_buf
-            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+            static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
                 constexpr index_t dst_offset =
                     dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                              i * src_scalar_step_in_vector);
@@ -1465,6 +1498,13 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
 
     using Index = MultiIndex<nDim>;
 
+    static constexpr index_t PackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic(
         const ElementwiseOperation& element_op)
         : element_op_{element_op}
@@ -1485,7 +1525,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
                         const SrcBuffer& src_buf,
                         const DstDesc&,
                         const DstSliceOriginIdx&,
-                        DstBuffer& dst_buf)
+                        DstBuffer& dst_buf) const
     {
         static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                       "wrong! Desc need to known at compile-time");
@@ -1519,26 +1559,71 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
 
         constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
 
-        static_for<0, num_access, 1>{}([&](auto idx_1d) {
-            constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
+        if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
+        {
+            static_for<0, num_access, 1>{}([&](auto idx_1d) {
+                typename vector_type_maker<SrcData, DstScalarPerVector / PackedSize>::type
+                    src_tmp_vector;
 
-            // copy data from src_buf into dst_vector
-            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
-                constexpr index_t src_offset = src_desc.CalculateOffset(
-                    src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+                constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
 
-                constexpr index_t dst_offset = dst_desc.CalculateOffset(
-                    dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+                // copy data from src_buf into dst_vector
+                static_for<0, DstScalarPerVector / PackedSize, 1>{}([&](auto i) {
+                    constexpr index_t src_offset = src_desc.CalculateOffset(
+                        src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
 
-                DstData v;
+                    src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
+                });
 
-                // apply element-wise operation
-                element_op_(v, src_buf[Number<src_offset>{}]);
+                // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to
+                // DstData)
+                vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;
 
-                // apply type convert
-                dst_buf(Number<dst_offset>{}) = v;
+                constexpr index_t pack_size = 8;
+
+                static_assert(DstScalarPerVector % pack_size == 0, "");
+
+                using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
+                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
+
+                static_for<0, DstScalarPerVector / pack_size, 1>{}([&](auto i) {
+                    ck::tensor_operation::element_wise::PassThroughPack8{}(
+                        dst_tmp_vector.template AsType<dst_v_t>()(i),
+                        src_tmp_vector.template AsType<src_v_t>()[i]);
+                });
+
+                // copy data from dst_tmp_vector into dst_buf
+                static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                        dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+
+                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+                });
             });
-        });
+        }
+        else
+        {
+            static_for<0, num_access, 1>{}([&](auto idx_1d) {
+                constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
+
+                // copy data from src_buf into dst_vector
+                static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
+                    constexpr index_t src_offset = src_desc.CalculateOffset(
+                        src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+
+                    constexpr index_t dst_offset = dst_desc.CalculateOffset(
+                        dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
+
+                    DstData v;
+
+                    // apply element-wise operation
+                    element_op_(v, src_buf[Number<src_offset>{}]);
+
+                    // apply type convert
+                    dst_buf(Number<dst_offset>{}) = v;
+                });
+            });
+        }
     }
 
     ElementwiseOperation element_op_;
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp
new file mode 100644
index 0000000000..bd6fe772e4
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp
@@ -0,0 +1,942 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor/static_tensor.hpp"
+#include "ck/utility/is_detected.hpp"
+
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
+
+namespace ck {
+
+// Assume:
+//   1. src_desc and dst_desc are not known at compile-time
+//   2. SrcBuffer and DstBuffer are DynamicBuffer
+//   3. src_slice_origin and dst_slice_origin are not known at compile-time,
+//   4. Use thread buffer
+template <typename SliceLengths,
+          typename SrcElementwiseOperation,
+          typename DstElementwiseOperation,
+          InMemoryDataOperationEnum DstInMemOp,
+          typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector_,
+          index_t DstScalarPerVector_,
+          index_t SrcScalarStrideInVector,
+          index_t DstScalarStrideInVector,
+          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
+                                           // RunRead(),  will be fused with MoveSrcSliceWindow to
+                                           // save addr computation
+          bool DstResetCoordinateAfterRun, // control whether to move back dst coordinate after each
+                                           // RunWrite(),  will be fused with MoveDstSliceWindow to
+                                           // save addr computation
+          typename IndexType,
+          index_t GatherDim        = 1,
+          index_t NumThreadScratch = 1>
+struct ThreadwiseTensorSliceTransfer_v3r1_gather
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+    using Index                   = MultiIndex<nDim>;
+
+    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
+    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
+
+    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
+    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
+
+    static constexpr auto I0  = Number<0>{};
+    static constexpr auto I1  = Number<1>{};
+    static constexpr auto I2  = Number<2>{};
+    static constexpr auto I3  = Number<3>{};
+    static constexpr auto I4  = Number<4>{};
+    static constexpr auto I5  = Number<5>{};
+    static constexpr auto I6  = Number<6>{};
+    static constexpr auto I7  = Number<7>{};
+    static constexpr auto I8  = Number<8>{};
+    static constexpr auto I10 = Number<10>{};
+    static constexpr auto I12 = Number<12>{};
+    static constexpr auto I13 = Number<13>{};
+    static constexpr auto I14 = Number<14>{};
+    static constexpr auto I16 = Number<16>{};
+
+    static constexpr index_t PackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr auto SrcScalarPerVector = Number<SrcScalarPerVector_ / PackedSize>{};
+    static constexpr auto DstScalarPerVector = Number<DstScalarPerVector_ / PackedSize>{};
+
+    static constexpr index_t gather_num = SliceLengths{}.At(Number<GatherDim>{});
+
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1_gather(
+        const SrcDesc& src_desc,
+        const Index& src_slice_origin,
+        const SrcElementwiseOperation& src_element_op,
+        const DstDesc& dst_desc,
+        const Index& dst_slice_origin,
+        const DstElementwiseOperation& dst_element_op,
+        const StaticallyIndexedArray<IndexType, gather_num>& gather_offsets)
+        : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
+          dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
+          src_element_op_(src_element_op),
+          dst_element_op_(dst_element_op),
+          gather_offsets_(gather_offsets)
+    {
+        if constexpr(is_same_v<remove_cvref_t<SrcData>, pk_i4_t>)
+        {
+            static_assert(is_same_v<remove_cvref_t<SrcData>, remove_cvref_t<DstData>>,
+                          "SrcData != DstData");
+
+            static_assert(
+                SrcScalarPerVector_ % PackedSize == 0 && DstScalarPerVector_ % PackedSize == 0,
+                "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type");
+
+            static_assert(SrcVectorDim == DstVectorDim, "pk_i4_t does not support transpose");
+        }
+    }
+
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
+    {
+
+        auto adjusted_origin_idx = [&]() {
+            Index idx;
+            static_for<0, nDim, 1>{}([&](auto i) {
+                idx(i) = i.value == GatherDim ? 0 : src_slice_origin_idx[Number<i>{}];
+            });
+            return idx;
+        }();
+        src_coord_ = make_tensor_coordinate(src_desc, adjusted_origin_idx);
+    }
+
+    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
+    {
+        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
+    }
+
+    template <typename SrcBuffer, index_t ThreadScratchId = 0>
+    __device__ void RunRead(const SrcDesc& src_desc,
+                            const SrcBuffer& src_buf,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
+                      "wrong!");
+
+        static_assert(
+            is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value,
+            "wrong! SrcBuffer and SrcData data type are inconsistent");
+
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0,
+                      "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+        constexpr auto ordered_gather_dim   = src_dim_access_order[GatherDim];
+        constexpr auto ordered_src_access_lengths =
+            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+
+        // make forward steps
+        const auto src_forward_steps = generate_tuple(
+            [&](auto i) {
+                Index forward_step_idx;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
+                });
+
+                return make_tensor_coordinate_step(src_desc, forward_step_idx);
+            },
+            Number<nDim>{});
+
+        // make backward steps
+        const auto src_backward_steps = generate_tuple(
+            [&](auto i) {
+                Index backward_step_idx;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
+                });
+
+                return make_tensor_coordinate_step(src_desc, backward_step_idx);
+            },
+            Number<nDim>{});
+
+        // loop over tensor and copy
+        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
+            // judge move forward or move backward
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                forward_sweep_(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_src_access_idx[I0];
+
+                    static_for<1, i, 1>{}([&](auto j) {
+                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
+                    });
+
+                    forward_sweep_(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep_;
+            }();
+
+            // calculate src data index
+            constexpr auto src_data_idx = [&]() {
+                Index ordered_idx;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
+                                                      : ordered_src_access_lengths[i] - 1 -
+                                                            ordered_src_access_idx[i];
+                });
+
+                return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                       src_scalar_per_access;
+            }();
+
+            constexpr auto src_data_idx_seq = generate_sequence_v2(
+                [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
+
+            auto gather_offset =
+                gather_offsets_(ordered_src_access_idx[Number<ordered_gather_dim>{}]);
+
+            const IndexType ld_offset = src_coord_.GetOffset() + gather_offset;
+            src_oob_thread_scratch_tuple_(thread_scratch_id)
+                .template SetAsType<bool>(src_data_idx_seq, true);
+
+            using src_vector_type = vector_type_maker_t<SrcData, SrcScalarPerVector>;
+            using src_vector_t    = typename src_vector_type::type;
+
+            auto src_vector_container =
+                src_vector_type{src_buf.template Get<src_vector_t>(ld_offset, true)};
+
+            using dst_vector_type = vector_type_maker_t<DstData, SrcScalarPerVector>;
+            using dst_vector_t    = typename dst_vector_type::type;
+            dst_vector_type op_r_v;
+
+            constexpr auto get_elem_op_vec_len = []() {
+                if constexpr(is_detected<is_pack8_invocable_t, decltype(src_element_op_)>::value)
+                {
+                    if constexpr(decltype(src_element_op_)::is_pack8_invocable)
+                        return math::min(8, SrcScalarPerVector);
+                }
+                else if constexpr(is_detected<is_pack4_invocable_t,
+                                              decltype(src_element_op_)>::value)
+                {
+                    if constexpr(decltype(src_element_op_)::is_pack4_invocable)
+                        return math::min(4, SrcScalarPerVector);
+                }
+                else if constexpr(is_detected<is_pack2_invocable_t,
+                                              decltype(src_element_op_)>::value)
+                {
+                    if constexpr(decltype(src_element_op_)::is_pack2_invocable)
+                        return math::min(2, SrcScalarPerVector);
+                }
+                else
+                {
+                    return 1;
+                }
+            };
+
+            constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
+
+            using src_elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
+            using dst_elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;
+
+            static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) {
+                // apply the src elementwise op and convert to DstData under the hood if needed
+                src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),
+                                src_vector_container.template AsType<src_elem_op_vec_t>()[idx]);
+            });
+
+            // copy data from src_vector_container into src_thread_scratch_
+            src_thread_scratch_tuple_(thread_scratch_id)
+                .template SetAsType<dst_vector_t>(src_data_idx_seq,
+                                                  op_r_v.template AsType<dst_vector_t>()[I0]);
+
+            auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim_(i) &=
+                            ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
+                    });
+                    move_on_dim_(i) &= i.value != ordered_gather_dim;
+                });
+
+                return move_on_dim_;
+            }
+            ();
+            // move src coord
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_tensor_coordinate(
+                            src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
+                    }
+                    else
+                    {
+                        move_tensor_coordinate(
+                            src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
+                    }
+                }
+            });
+        });
+
+        // move src coordinate back to slice origin (or not)
+        if constexpr(SrcResetCoordinateAfterRun)
+        {
+            const auto src_reset_step =
+                make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep());
+
+            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
+        }
+    }
+
+    template <typename SeqIdx, index_t ThreadScratchId = 0>
+    __device__ constexpr auto
+    GetSrcThreadScratchIdx(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        using vector_t = typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+        return src_thread_scratch_tuple_(thread_scratch_id).template GetAsType<vector_t>(SeqIdx{});
+    }
+
+    template <index_t ThreadScratchId>
+    __device__ void
+    TransferDataFromSrcThreadScratchToDstThreadScratch(Number<ThreadScratchId> thread_scratch_id)
+    {
+#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
+        static_ford<SliceLengths>{}([&](auto idx) {
+            dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
+        });
+#else
+
+        // OOB Check
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+
+        constexpr auto ordered_src_access_lengths =
+            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+
+        // loop over tensor and copy
+        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
+            // judge move forward or move backward
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                forward_sweep_(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_src_access_idx[I0];
+
+                    static_for<1, i, 1>{}([&](auto j) {
+                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
+                    });
+
+                    forward_sweep_(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep_;
+            }();
+
+            // calculate src data index
+            constexpr auto src_data_idx = [&]() {
+                Index ordered_idx;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
+                                                      : ordered_src_access_lengths[i] - 1 -
+                                                            ordered_src_access_idx[i];
+                });
+
+                return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                       src_scalar_per_access;
+            }();
+
+            constexpr auto src_data_idx_seq = generate_sequence_v2(
+                [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
+
+            using vector_t = typename vector_type_maker<DstData, SrcScalarPerVector>::type::type;
+
+            auto op_r = src_thread_scratch_tuple_(thread_scratch_id)
+                            .template GetAsType<vector_t>(src_data_idx_seq);
+
+            auto op_r_v = op_r;
+
+            src_thread_scratch_tuple_(thread_scratch_id)
+                .template SetAsType<vector_t>(src_data_idx_seq, op_r_v);
+        });
+
+        // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
+        // TODO make this logic more generic for more sub-dword datatype
+        if constexpr(SrcVectorDim != DstVectorDim &&
+                     ((is_same<half_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
+                      (is_same<int8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0) ||
+                      (is_same<f8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
+        {
+            static_assert(!is_same_v<remove_cvref_t<SrcData>, pk_i4_t>,
+                          "in-register transpose is not supported for pk_i4_t");
+            // each transpose does
+            // DstScalarPerVector # of src vectors in src_thread_scratch_
+            // SrcScalarPerVector # of dst vectors in dst_thread_scratch_
+            constexpr index_t num_src_vector = Number<DstScalarPerVector>{};
+            constexpr index_t num_dst_vector = Number<SrcScalarPerVector>{};
+
+            // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose
+            // TODO: make this logic generic for all scenario
+            static_assert(SrcVectorDim != DstVectorDim, "wrong");
+
+            constexpr auto src_scalar_step_in_vector = generate_sequence(
+                detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
+
+            constexpr auto dst_scalar_step_in_vector = generate_sequence(
+                detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
+
+            constexpr auto scalar_per_access = generate_sequence(
+                detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
+                                                                 SrcScalarPerVector,
+                                                                 DstVectorDim,
+                                                                 DstScalarPerVector>{},
+                Number<nDim>{});
+
+            constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+
+            static_ford<decltype(access_lengths)>{}([&](auto access_idx) {
+                constexpr auto data_idx = access_idx * scalar_per_access;
+
+                constexpr auto data_idx_seq = generate_sequence_v2(
+                    [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
+
+                using src_vector_t = vector_type_maker_t<DstData, SrcScalarPerVector>;
+                using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
+
+                // get DstScalarPerVector # of read-only references to src vectors from
+                // src_thread_scratch_
+                const auto src_vector_refs = generate_tie(
+                    [&](auto i) -> const src_vector_t& {
+                        // i increment corresponds to movement in DstVectorDim
+                        return src_thread_scratch_tuple_[thread_scratch_id].GetVectorTypeReference(
+                            data_idx_seq + i * dst_scalar_step_in_vector);
+                    },
+                    Number<num_src_vector>{});
+
+                // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_
+                auto dst_vector_refs = generate_tie(
+                    [&](auto i) -> dst_vector_t& {
+                        // i increment corresponds to movement in SrcVectorDim
+                        return dst_thread_scratch_.GetVectorTypeReference(
+                            data_idx_seq + i * src_scalar_step_in_vector);
+                    },
+                    Number<num_dst_vector>{});
+
+                // do data transpose
+                transpose_vectors<DstData, DstScalarPerVector, SrcScalarPerVector>{}(
+                    src_vector_refs, dst_vector_refs);
+            });
+        }
+        else
+        {
+            constexpr auto packed_per_access = generate_sequence(
+                detail::lambda_scalar_per_access<SrcVectorDim, PackedSize>{}, Number<nDim>{});
+
+            constexpr auto packed_access_lengths = SliceLengths{} / packed_per_access;
+
+            static_ford<decltype(packed_access_lengths)>{}([&](auto idx) {
+                dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
+            });
+        }
+#endif
+    }
+
+    template <typename DstBuffer, index_t ThreadScratchId = 0>
+    __device__ void RunWrite(const DstDesc& dst_desc,
+                             DstBuffer& dst_buf,
+                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // if there is transpose, it's done here
+        // if there is oob check, it's done here
+        // TODO move this elsewhere
+        TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id);
+
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
+                          DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
+                      "wrong!");
+
+        static_assert(
+            is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
+            "wrong! SrcBuffer or DstBuffer data type is wrong");
+
+        // src scalar per access on each dim
+        // TODO: don't use this
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
+
+        constexpr auto ordered_dst_access_lengths =
+            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+
+        // make forward steps
+        const auto dst_forward_steps = generate_tuple(
+            [&](auto i) {
+                Index forward_step_idx;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
+                });
+
+                return make_tensor_coordinate_step(dst_desc, forward_step_idx);
+            },
+            Number<nDim>{});
+
+        // make backward steps
+        const auto dst_backward_steps = generate_tuple(
+            [&](auto i) {
+                Index backward_step_idx;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
+                });
+
+                return make_tensor_coordinate_step(dst_desc, backward_step_idx);
+            },
+            Number<nDim>{});
+
+        // loop over tensor and copy
+        static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
+            // judge move forward or move backward
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                forward_sweep_(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_dst_access_idx[I0];
+
+                    static_for<1, i, 1>{}([&](auto j) {
+                        tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
+                    });
+
+                    forward_sweep_(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep_;
+            }();
+
+            // calculate dst data index
+            constexpr auto dst_data_idx = [&]() {
+                Index ordered_idx;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
+                                                      : ordered_dst_access_lengths[i] - 1 -
+                                                            ordered_dst_access_idx[i];
+                });
+
+                return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
+                       dst_scalar_per_access;
+            }();
+
+            constexpr auto dst_data_idx_seq = generate_sequence_v2(
+                [&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
+
+            const bool is_dst_valid =
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
+
+            using dst_vector_type = vector_type_maker_t<DstData, DstScalarPerVector>;
+            using dst_vector_t    = typename dst_vector_type::type;
+
+            // copy data from dst_thread_scratch_ into dst_vector_container
+            auto dst_vector_container = dst_vector_type{
+                dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
+
+            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
+                DstData dst_v;
+
+                // apply DstElementwiseOperation
+                dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
+
+                dst_vector_container.template AsType<DstData>()(i) = dst_v;
+            });
+
+            // copy data from dst_vector_container to dst_buf
+            dst_buf.template Set<dst_vector_t>(
+                dst_coord_.GetOffset() / PackedSize,
+                is_dst_valid,
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);
+
+            constexpr auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim_(i) &=
+                            ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
+                    });
+                });
+
+                return move_on_dim_;
+            }
+            ();
+
+            // move dst coord
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_tensor_coordinate(
+                            dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
+                    }
+                    else
+                    {
+                        move_tensor_coordinate(
+                            dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
+                    }
+                }
+            });
+        });
+
+        // move dst coordinate back to slice origin (or not)
+        if constexpr(DstResetCoordinateAfterRun)
+        {
+            const auto dst_reset_step =
+                make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
+
+            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
+        }
+    }
+
+    __device__ static constexpr auto GetSrcCoordinateResetStep()
+    {
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+
+        constexpr auto ordered_src_access_lengths =
+            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+            forward_sweep_(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_src_access_lengths[I0] - 1;
+
+                static_for<1, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
+                });
+
+                forward_sweep_(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep_;
+        }();
+
+        // calculate src data index after last iteration in RunRead(), if it has not being reset by
+        // RunRead()
+        constexpr auto src_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
+            });
+
+            return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                   src_scalar_per_access;
+        }();
+
+        //
+        constexpr auto reset_src_data_step = [&]() {
+            Index reset_src_data_step_;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                reset_src_data_step_(i) = i.value == GatherDim ? 0 : -src_data_idx[i];
+            });
+
+            return reset_src_data_step_;
+        }();
+        return reset_src_data_step;
+    }
+
+    __device__ static constexpr auto GetDstCoordinateResetStep()
+    {
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
+
+        constexpr auto ordered_dst_access_lengths =
+            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+            forward_sweep_(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_dst_access_lengths[I0] - 1;
+
+                static_for<1, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
+                });
+
+                forward_sweep_(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep_;
+        }();
+
+        // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
+        // RunWrite()
+        constexpr auto dst_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
+            });
+
+            return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
+                   dst_scalar_per_access;
+        }();
+
+        //
+        constexpr auto reset_dst_data_step = [&]() {
+            Index reset_dst_data_step_;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
+
+            return reset_dst_data_step_;
+        }();
+
+        return reset_dst_data_step;
+    }
+
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
+                                       const Index& src_slice_origin_step_idx)
+    {
+        // if src coord was not reset by RunRead(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
+                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
+        // is it OK to construct a new step every time?
+        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
+
+        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
+    }
+
+    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
+                                       const Index& dst_slice_origin_step_idx)
+    {
+        // if dst coord was not reset by RunWrite(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
+                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
+
+        // is it OK to construct a new step every time?
+        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
+
+        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
+    }
+
+    __device__ static constexpr auto GetSrcThreadScratchDescriptor()
+    {
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_lengths_and_vector_length = container_push_back(
+            sequence_to_tuple_of_number(src_access_lengths), Number<SrcScalarPerVector>{});
+
+        // 1st stage of transforms
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(src_access_lengths_and_vector_length[i],
+                                   src_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    __device__ static constexpr auto GetSrcOOBThreadScratchDescriptor()
+    {
+        constexpr auto src_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        return make_naive_tensor_descriptor_packed(src_access_lengths);
+    }
+
+    __device__ static constexpr auto GetDstThreadScratchDescriptor()
+    {
+        // 1st stage of transforms
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_lengths_and_vector_length = container_push_back(
+            sequence_to_tuple_of_number(dst_access_lengths), Number<DstScalarPerVector>{});
+
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(dst_access_lengths_and_vector_length[i],
+                                   dst_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    private:
+    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
+    static constexpr auto src_oob_thread_scratch_desc_ =
+        decltype(GetSrcThreadScratchDescriptor()){};
+    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
+
+    using SrcThreadScratch =
+        StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                        DstData, // apply data_convert with SrcThreadScratch
+                                        SrcScalarPerVector,
+                                        decltype(src_thread_scratch_desc_),
+                                        true>;
+
+    using SrcOOBThreadScratch =
+        StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                        bool, // apply data_convert with SrcThreadScratch
+                                        1,
+                                        decltype(src_oob_thread_scratch_desc_),
+                                        true>;
+
+    using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                                             DstData,
+                                                             DstScalarPerVector,
+                                                             decltype(dst_thread_scratch_desc_),
+                                                             true>;
+
+    StaticallyIndexedArray<SrcThreadScratch, NumThreadScratch> src_thread_scratch_tuple_;
+    StaticallyIndexedArray<SrcOOBThreadScratch, NumThreadScratch> src_oob_thread_scratch_tuple_;
+
+    DstThreadScratch dst_thread_scratch_;
+
+    SrcCoord src_coord_;
+    DstCoord dst_coord_;
+    const SrcElementwiseOperation src_element_op_;
+    const DstElementwiseOperation dst_element_op_;
+    StaticallyIndexedArray<IndexType, gather_num> gather_offsets_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
new file mode 100644
index 0000000000..7cd0a0fc7f
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
@@ -0,0 +1,681 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
+#include "ck/utility/is_detected.hpp"
+#include "ck/tensor/static_tensor.hpp"
+
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
+
+namespace ck {
+// Thread-level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalerPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//   6. Does not need to know src_descs and dst_descs at compile-time
+//   7. Does not need to know src_slice_origins and dst_slice_origins at compile-time,
+//
+// Does following things to avoid scratch memory issue
+//   1. Use StaticallyIndexedArray or vector_type instead of C array for thread buffer
+//   2. Pass tensor descritpors by reference (or tuple of references)
+//   3. Does not keep reference to tensor descriptor
+//   4. Does not construct new tensor coordinate when call Run()
+template <typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          typename SrcScalarPerVectors,
+          index_t DstScalarPerVector,
+          typename SrcResetCoordinateAfterRunFlags, // Sequence<bool ...>
+          typename DstResetCoordinateAfterRunFlags, // Sequence<bool ...>
+          typename IndexType,
+          index_t ScatterDim       = 1,
+          bool OutputScatter       = true,
+          index_t ScatterWeightIdx = 3,
+          index_t NumThreadScratch = 1>
+struct ThreadwiseTensorSliceTransfer_v7r3_scatter
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    static constexpr auto SrcScalarPerVector = SrcScalarPerVectors{}[I0];
+
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    static constexpr index_t nSrc = SrcDescs::Size();
+    static constexpr index_t nDst = DstDescs::Size();
+
+    using Index                          = MultiIndex<nDim>;
+    static constexpr index_t scatter_num = SliceLengths{}.At(Number<ScatterDim>{});
+
+    // return a tuple of coordiantes for a tuple of tensor
+    template <typename Descs,
+              typename Indices,
+              enable_if_t<Descs::Size() == Indices::Size(), bool> = false>
+    static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices)
+    {
+        return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); },
+                              Number<Descs::Size()>{});
+    }
+
+    using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray<Index, nSrc>{}));
+    using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray<Index, nDst>{}));
+
+    // scalar per access on each dim
+    // FIXME: don't use lambda_scalar_per_access
+    static constexpr auto src_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
+    static constexpr auto dst_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+
+    using SrcSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                   SrcDimAccessOrder,
+                                                   remove_cv_t<decltype(src_scalar_per_access)>,
+                                                   false>;
+
+    using DstSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                   DstDimAccessOrder,
+                                                   remove_cv_t<decltype(dst_scalar_per_access)>,
+                                                   false>;
+
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v7r3_scatter(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_slice_origins,
+        const ElementwiseOperation& element_op)
+        : src_coords_(MakeCoordinates(src_descs, src_slice_origins)),
+          dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)),
+          element_op_(element_op)
+    {
+        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
+                      "wrong! cannot evenly divide");
+
+        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
+                      "wrong! cannot evenly divide");
+    }
+
+    template <typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(), bool> = false>
+    __device__ void SetSrcSliceOrigins(const SrcDescs& src_descs,
+                                       const Indices& src_slice_origin_idxs)
+    {
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            src_coords_(i) = make_tensor_coordinate(src_descs[i], src_slice_origin_idxs[i]);
+        });
+    }
+
+    template <typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(), bool> = false>
+    __device__ void SetDstSliceOrigins(const DstDescs& dst_descs,
+                                       const Indices& dst_slice_origin_idxs)
+    {
+        static_for<0, nDst, 1>{}([&](auto i) {
+            dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]);
+        });
+    }
+
+    template <typename DataTypes, index_t ScalarPerVector>
+    __device__ static auto generate_vectors()
+    {
+        auto data_types = DataTypes{};
+
+        constexpr index_t num = data_types.Size();
+
+        return generate_tuple(
+            [&](auto i) {
+                using DataType = remove_cvref_t<decltype(data_types[i])>;
+
+                return vector_type_maker_t<DataType, ScalarPerVector>{};
+            },
+            Number<num>{});
+    }
+
+    // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
+    // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
+    template <typename SrcBuffers,
+              index_t ThreadScratchId                                   = 0,
+              enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
+    __device__ void RunRead(const SrcDescs& src_descs,
+                            const SrcBuffers& src_bufs,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // loop over space-filling curve
+        static_for<0, src_num_access, 1>{}([&](auto iAccess) {
+            auto src_vectors = generate_vectors<SrcDatas, SrcScalarPerVector>();
+            auto elm_vectors = generate_vectors<DstDatas, SrcScalarPerVector>();
+
+            bool oob_val = true;
+
+            // copy data from src_bufs into src_vectors
+            static_for<0, nSrc, 1>{}([&](auto i) {
+                using src_vector_t = typename remove_cvref_t<decltype(src_vectors[i])>::type;
+
+                const bool is_src_valid =
+                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i],
+                                                                                src_coords_[i]);
+
+                oob_val = oob_val & is_src_valid;
+                src_vectors(i).template AsType<src_vector_t>()(I0) =
+                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(), true);
+            });
+
+            constexpr auto get_elem_op_vec_len = []() {
+                if constexpr(is_detected<is_pack8_invocable_t, decltype(element_op_)>::value)
+                {
+                    if constexpr(decltype(element_op_)::is_pack8_invocable)
+                        return math::min(8, SrcScalarPerVector);
+                }
+                if constexpr(is_detected<is_pack4_invocable_t, decltype(element_op_)>::value)
+                {
+                    if constexpr(decltype(element_op_)::is_pack4_invocable)
+                        return math::min(4, SrcScalarPerVector);
+                }
+                if constexpr(is_detected<is_pack2_invocable_t, decltype(element_op_)>::value)
+                {
+                    if constexpr(decltype(element_op_)::is_pack2_invocable)
+                        return math::min(2, SrcScalarPerVector);
+                }
+                return 1;
+            };
+
+            constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
+
+            // apply pointwise function
+            static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto i) {
+                // get reference to src data
+                const auto src_data_refs = generate_tie(
+                    // return type should be lvalue
+                    [&](auto iSrc) -> const auto& {
+                        using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;
+
+                        using elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
+
+                        return src_vectors[iSrc].template AsType<elem_op_vec_t>()[i];
+                    },
+                    Number<nSrc>{});
+
+                // get reference to dst data
+                auto dst_data_refs = generate_tie(
+                    // return type should be lvalue
+                    [&](auto iDst) -> auto& {
+                        using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;
+
+                        using elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;
+
+                        return elm_vectors(iDst).template AsType<elem_op_vec_t>()(i);
+                    },
+                    Number<nDst>{});
+
+                // apply pointwise function
+                // pointwise function signature:
+                // element_op_(dst_data_refs[I0],
+                //             dst_data_refs[I1],
+                //             ...,
+                //             src_data_refs[I0],
+                //             src_data_refs[I1],
+                //             ...)
+                unpack2(element_op_, dst_data_refs, src_data_refs);
+            });
+
+            elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
+            oob_vectors_tuple_(thread_scratch_id)(iAccess) = oob_val;
+
+            // move coordinate
+            if constexpr(iAccess.value != src_num_access - 1)
+            {
+                constexpr auto forward_step = SrcSpaceFillingCurve::GetForwardStep(iAccess);
+
+                static_for<0, nSrc, 1>{}([&](auto i) {
+                    move_tensor_coordinate(src_descs[i],
+                                           src_coords_(i),
+                                           make_tensor_coordinate_step(src_descs[i], forward_step));
+                });
+            }
+        });
+
+        // move coordinate back to slice origin (or not)
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
+            {
+                const auto src_reset_step =
+                    make_tensor_coordinate_step(src_descs[i], GetSrcCoordinateResetStep());
+
+                move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step);
+            }
+        });
+    }
+
+#if 1
+    template <index_t ThreadScratchId = 0>
+    __device__ void OOBCheck(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // loop over space-filling curve
+        static_for<0, src_num_access, 1>{}([&](auto iAccess) {
+            auto elm_vectors = elm_vectors_tuple_[thread_scratch_id][iAccess];
+            auto oob_val     = oob_vectors_tuple_[thread_scratch_id][iAccess];
+
+            static_for<0, nDst, 1>{}([&](auto i) {
+                using elm_vector_t = typename remove_cvref_t<decltype(elm_vectors[i])>::type;
+                elm_vectors(i).template AsType<elm_vector_t>()(I0) =
+                    oob_val ? elm_vectors(i).template AsType<elm_vector_t>()[I0] : elm_vector_t{0};
+            });
+
+            elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
+        });
+    }
+#endif
+
+    template <index_t ThreadScratchId = 0>
+    __device__ void
+    TransposeFromElmToDst(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        using DstData = remove_cvref_t<decltype(DstDatas{}[I0])>;
+
+        using ElmThreadScratch =
+            StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                            DstData,
+                                            SrcScalarPerVector,
+                                            decltype(GetSrcThreadScratchDescriptor()),
+                                            true>;
+        using DstThreadScratch =
+            StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
+                                            DstData,
+                                            DstScalarPerVector,
+                                            decltype(GetDstThreadScratchDescriptor()),
+                                            true>;
+
+        ElmThreadScratch elm_thread_scratch_;
+        DstThreadScratch dst_thread_scratch_;
+
+        elm_thread_scratch_.data_ =
+            bit_cast<decltype(elm_thread_scratch_.data_)>(elm_vectors_tuple_[thread_scratch_id]);
+
+        if constexpr(SrcVectorDim != DstVectorDim &&
+                     ((is_same<half_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
+                      (is_same<f8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0) ||
+                      (is_same<int8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
+        {
+            // each transpose does
+            // DstScalarPerVector # of src vectors in src_thread_scratch_
+            // SrcScalarPerVector # of dst vectors in dst_thread_scratch_
+            constexpr index_t num_src_vector = Number<DstScalarPerVector>{};
+            constexpr index_t num_dst_vector = Number<SrcScalarPerVector>{};
+
+            // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose
+            // TODO: make this logic generic for all scenario
+
+            constexpr auto src_scalar_step_in_vector = generate_sequence(
+                detail::lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
+
+            constexpr auto dst_scalar_step_in_vector = generate_sequence(
+                detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
+
+            constexpr auto scalar_per_access = generate_sequence(
+                detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
+                                                                 SrcScalarPerVector,
+                                                                 DstVectorDim,
+                                                                 DstScalarPerVector>{},
+                Number<nDim>{});
+
+            constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
+
+            static_ford<decltype(access_lengths)>{}([&](auto access_idx) {
+                constexpr auto data_idx = access_idx * scalar_per_access;
+
+                constexpr auto data_idx_seq = generate_sequence_v2(
+                    [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
+
+                using src_vector_t = vector_type_maker_t<DstData, SrcScalarPerVector>;
+                using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
+
+                // get DstScalarPerVector # of read-only references to src vectors from
+                // src_thread_scratch_
+                const auto src_vector_refs = generate_tie(
+                    [&](auto i) -> const src_vector_t& {
+                        // i increment corresponds to movement in DstVectorDim
+                        return elm_thread_scratch_.GetVectorTypeReference(
+                            data_idx_seq + i * dst_scalar_step_in_vector);
+                    },
+                    Number<num_src_vector>{});
+
+                // get SrcScalarPerVector # of references to dst vectors from
+                // dst_thread_scratch_
+                auto dst_vector_refs = generate_tie(
+                    [&](auto i) -> dst_vector_t& {
+                        // i increment corresponds to movement in SrcVectorDim
+                        return dst_thread_scratch_.GetVectorTypeReference(
+                            data_idx_seq + i * src_scalar_step_in_vector);
+                    },
+                    Number<num_dst_vector>{});
+
+                // do data transpose
+                transpose_vectors<DstData, DstScalarPerVector, SrcScalarPerVector>{}(
+                    src_vector_refs, dst_vector_refs);
+            });
+        }
+        else
+        {
+            static_ford<SliceLengths>{}(
+                [&](auto idx) { dst_thread_scratch_(idx) = elm_thread_scratch_[idx]; });
+        }
+
+        dst_vectors_tuple_(thread_scratch_id) = bit_cast<DstVectorTuple>(dst_thread_scratch_.data_);
+    }
+
+    // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
+    // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
+    template <typename DstBuffers,
+              index_t ThreadScratchId                                             = 0,
+              enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1, bool> = false>
+    __device__ void RunWrite(const DstDescs& dst_descs,
+                             DstBuffers dst_bufs,
+                             StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets,
+                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        OOBCheck(thread_scratch_id);
+        TransposeFromElmToDst(thread_scratch_id);
+
+        // loop over space-filling curve
+        static_for<0, dst_num_access, 1>{}([&](auto iAccess) {
+            auto dst_vectors         = dst_vectors_tuple_[thread_scratch_id][iAccess];
+            IndexType scatter_offset = 0;
+            if constexpr(OutputScatter)
+            {
+                constexpr auto iScatter =
+                    DstSpaceFillingCurve::GetIndex(iAccess)(Number<ScatterDim>{});
+                scatter_offset = scatter_offsets(Number<iScatter>{});
+            }
+            // copy data from buf_vectors into dst_bufs
+            static_for<0, nDst, 1>{}([&](auto i) {
+                using dst_vector_t      = typename remove_cvref_t<decltype(dst_vectors[i])>::type;
+                IndexType dst_offset    = scatter_offset + (dst_coords_[i].GetOffset());
+                const bool is_dst_valid = dst_offset < dst_descs[i].GetElementSpaceSize();
+                // coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i],
+                //                                                             dst_coords_[i]);
+                constexpr InMemoryDataOperationEnum DstInMemOp =
+                    static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(i.value));
+                dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
+                    dst_offset, is_dst_valid, dst_vectors[i].template AsType<dst_vector_t>()[I0]);
+            });
+
+            // move coordinate
+            if constexpr(iAccess.value != dst_num_access - 1)
+            {
+                constexpr auto forward_step = DstSpaceFillingCurve::GetForwardStep(iAccess);
+
+                auto forward_step_scatter = [&]() constexpr
+                {
+                    Index step_;
+
+                    static_for<0, nDim, 1>{}([&](auto i) {
+                        step_(i) = (i.value == ScatterDim && OutputScatter) ? 0 : forward_step[i];
+                    });
+
+                    return step_;
+                }
+                ();
+                static_for<0, nDst, 1>{}([&](auto i) {
+                    move_tensor_coordinate(
+                        dst_descs[i],
+                        dst_coords_(i),
+                        make_tensor_coordinate_step(dst_descs[i], forward_step_scatter));
+                });
+            }
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            if constexpr(DstResetCoordinateAfterRunFlags::At(i))
+            {
+                const auto dst_reset_step =
+                    make_tensor_coordinate_step(dst_descs[i], GetDstCoordinateResetStep());
+
+                move_tensor_coordinate(dst_descs[i], dst_coords_(i), dst_reset_step);
+            }
+        });
+    }
+
+    // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
+    // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
+    // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
+    // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
+    template <typename SrcBuffers,
+              typename DstBuffers,
+              enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
+                              DstDescs::Size() == DstBuffers::Size(),
+                          bool> = false>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs,
+                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs, scatter_offsets);
+    }
+
+    __device__ static constexpr auto GetSrcCoordinateResetStep()
+    {
+        if constexpr(src_num_access == 0)
+        {
+            return typename SrcSpaceFillingCurve::Index{};
+        }
+        else
+        {
+            return SrcSpaceFillingCurve::GetStepBetween(Number<src_num_access - 1>{}, Number<0>{});
+        }
+    }
+
+    __device__ static constexpr auto GetDstCoordinateResetStep()
+    {
+        if constexpr(dst_num_access == 0)
+        {
+            return typename DstSpaceFillingCurve::Index{};
+        }
+        else
+        {
+            constexpr auto reset_step =
+                DstSpaceFillingCurve::GetStepBetween(Number<dst_num_access - 1>{}, Number<0>{});
+            auto reset_step_scatter = [&]() constexpr
+            {
+                Index step_;
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    step_(i) =
+                        (i.value == ScatterDim && OutputScatter) ? 0 : reset_step[Number<i>{}];
+                });
+
+                return step_;
+            }
+            ();
+            return reset_step_scatter;
+        }
+    }
+
+    __device__ static constexpr auto GetSrcThreadScratchDescriptor()
+    {
+        // constexpr auto src_scalar_per_access = generate_sequence(
+        // detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{},
+        // Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_lengths_and_vector_length = container_push_back(
+            sequence_to_tuple_of_number(src_access_lengths), Number<SrcScalarPerVector>{});
+
+        // 1st stage of transforms
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(src_access_lengths_and_vector_length[i],
+                                   src_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(src_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    __device__ static constexpr auto GetDstThreadScratchDescriptor()
+    {
+        // 1st stage of transforms
+        // constexpr auto dst_scalar_per_access = generate_sequence(
+        // detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{},
+        // Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_lengths_and_vector_length = container_push_back(
+            sequence_to_tuple_of_number(dst_access_lengths), Number<DstScalarPerVector>{});
+
+        constexpr auto desc0 =
+            make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
+
+        // 2nd stage of transforms
+        constexpr auto transforms = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return make_merge_transform_v3_division_mod(
+                        make_tuple(dst_access_lengths_and_vector_length[i],
+                                   dst_access_lengths_and_vector_length[Number<nDim>{}]));
+                }
+                else
+                {
+                    return make_pass_through_transform(dst_access_lengths_and_vector_length[i]);
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto low_dim_idss = generate_tuple(
+            [&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    return Sequence<i.value, nDim>{};
+                }
+                else
+                {
+                    return Sequence<i.value>{};
+                }
+            },
+            Number<nDim>{});
+
+        constexpr auto up_dim_idss =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
+
+        return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
+    }
+
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    template <index_t ISrc>
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
+                                       Number<ISrc> iSrc,
+                                       const Index& src_slice_origin_step_idx)
+    {
+        // if src coord was not reset by RunRead(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            SrcResetCoordinateAfterRunFlags::At(iSrc)
+                ? src_slice_origin_step_idx
+                : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
+
+        // is it OK to construct a new step every time?
+        const auto adjusted_step = make_tensor_coordinate_step(src_descs[iSrc], adjusted_step_idx);
+
+        move_tensor_coordinate(src_descs[iSrc], src_coords_(iSrc), adjusted_step);
+    }
+
+    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
+    template <index_t IDst>
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs,
+                                       Number<IDst> iDst,
+                                       const Index& dst_slice_origin_step_idx)
+    {
+        // if dst coord was not reset by Run(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            DstResetCoordinateAfterRunFlags::At(iDst)
+                ? dst_slice_origin_step_idx
+                : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
+
+        auto adjusted_step_idx_scatter = [&]() {
+            Index step_;
+            static_for<0, nDim, 1>{}([&](auto i) {
+                step_(i) =
+                    (i.value == ScatterDim && OutputScatter) ? 0 : adjusted_step_idx[Number<i>{}];
+            });
+
+            return step_;
+        }();
+        // is it OK to construct a new step every time?
+        const auto adjusted_step =
+            make_tensor_coordinate_step(dst_descs[iDst], adjusted_step_idx_scatter);
+
+        move_tensor_coordinate(dst_descs[iDst], dst_coords_(iDst), adjusted_step);
+    }
+
+    private:
+    using SrcVectorsType = decltype(generate_vectors<SrcDatas, SrcScalarPerVector>());
+    using ElmVectorsType = decltype(generate_vectors<DstDatas, SrcScalarPerVector>());
+    using DstVectorsType = decltype(generate_vectors<DstDatas, DstScalarPerVector>());
+
+    static constexpr auto src_num_access = SrcSpaceFillingCurve::GetNumOfAccess();
+    static constexpr auto dst_num_access = DstSpaceFillingCurve::GetNumOfAccess();
+
+    using ElmVectorTuple = StaticallyIndexedArray<ElmVectorsType, src_num_access>;
+    using DstVectorTuple = StaticallyIndexedArray<DstVectorsType, dst_num_access>;
+
+    StaticallyIndexedArray<ElmVectorTuple, NumThreadScratch> elm_vectors_tuple_;
+    StaticallyIndexedArray<DstVectorTuple, NumThreadScratch> dst_vectors_tuple_;
+
+    using OOBVectorTuple = StaticallyIndexedArray<bool, src_num_access>;
+    StaticallyIndexedArray<OOBVectorTuple, NumThreadScratch> oob_vectors_tuple_;
+
+    SrcCoords src_coords_;
+    DstCoords dst_coords_;
+    const ElementwiseOperation element_op_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index 1abae56be4..429df2413f 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -22,6 +22,10 @@ enum struct WmmaInstr
     wmma_f32_16x16x16_f16_gfx12,
     wmma_f32_16x16x16_bf16_gfx12,
     wmma_i32_16x16x16_iu8_gfx12,
+    wmma_f32_16x16x16_f8f8_gfx12,
+    wmma_f32_16x16x16_f8bf8_gfx12,
+    wmma_f32_16x16x16_bf8f8_gfx12,
+    wmma_f32_16x16x16_bf8bf8_gfx12,
 };
 
 /*
@@ -400,6 +404,146 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8_gfx12,
     }
 };
 
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent propety
+    static constexpr index_t wave_size              = Number<WaveSize>{};
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+#ifdef __gfx12__
+            intrin_wmma_f32_16x16x16_f8f8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+#else
+            ignore = a;
+            ignore = b;
+            ignore = reg_c;
+#endif
+        }
+    }
+};
+
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent propety
+    static constexpr index_t wave_size              = Number<WaveSize>{};
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+#ifdef __gfx12__
+            intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+#else
+            ignore = a;
+            ignore = b;
+            ignore = reg_c;
+#endif
+        }
+    }
+};
+
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent propety
+    static constexpr index_t wave_size              = Number<WaveSize>{};
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+#ifdef __gfx12__
+            intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+#else
+            ignore = a;
+            ignore = b;
+            ignore = reg_c;
+#endif
+        }
+    }
+};
+
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent propety
+    static constexpr index_t wave_size              = Number<WaveSize>{};
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+#ifdef __gfx12__
+            intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+#else
+            ignore = a;
+            ignore = b;
+            ignore = reg_c;
+#endif
+        }
+    }
+};
+
 template <typename src_type_a,
           typename src_type_b,
           typename dst_type,
@@ -463,6 +607,31 @@ struct WmmaSelector
         return WmmaInstr::wmma_i32_16x16x16_iu4;
     }
 #endif
+
+    template <>
+    constexpr auto GetWmma<f8_t, f8_t, float, 16, 16>()
+    {
+        return WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12;
+    }
+
+    template <>
+    constexpr auto GetWmma<f8_t, bf8_t, float, 16, 16>()
+    {
+        return WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12;
+    }
+
+    template <>
+    constexpr auto GetWmma<bf8_t, f8_t, float, 16, 16>()
+    {
+        return WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12;
+    }
+
+    template <>
+    constexpr auto GetWmma<bf8_t, bf8_t, float, 16, 16>()
+    {
+        return WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12;
+    }
+
     // get_warp_size do not return the correct wavesize, hardcode to 32 as workaround
     static constexpr auto selected_wmma =
         wmma_type<GetWmma<src_type_a, src_type_b, dst_type, MPerWmma, NPerWmma>(), Number<32>{}>{};
@@ -612,14 +781,17 @@ struct WmmaGemm
                 (is_same<src_type_a, bhalf_t>::value && is_same<src_type_b, bhalf_t>::value &&
                  is_same<dst_type, bhalf_t>::value) ||
                 (is_same<src_type_a, int8_t>::value && is_same<src_type_b, int8_t>::value &&
-                 is_same<dst_type, int32_t>::value)
+                 is_same<dst_type, int32_t>::value) ||
+                ((is_same<src_type_a, f8_t>::value || is_same<src_type_a, bf8_t>::value) &&
+                 (is_same<src_type_b, f8_t>::value || is_same<src_type_b, bf8_t>::value) &&
+                 is_same<dst_type, float>::value) ||
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-                || (is_same<src_type_a, int4_t>::value && is_same<src_type_b, int4_t>::value &&
-                    is_same<dst_type, int32_t>::value)
+                (is_same<src_type_a, int4_t>::value && is_same<src_type_b, int4_t>::value &&
+                 is_same<dst_type, int32_t>::value) ||
 #endif
-                ,
+                false,
             "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), "
-            "(int8, int32) or (int4, int32)!");
+            "((f8 or bf8, f8 or bf8), float), (int8, int32) or (int4, int32)!");
         static_for<0, KPack / wmma_instr.k_per_wmma, 1>{}([&](auto k) {
             if constexpr(!TransposeC)
             {
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 8c0b950941..08c4e4ba6e 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -793,7 +793,7 @@ struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
     static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
     static constexpr index_t m_per_blk           = 32;   // from the instruction
     static constexpr index_t n_per_blk           = 32;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 64 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
     static constexpr bool is_k_reduction         = true; // ???
     // clang-format on
 
@@ -817,7 +817,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x128f8f6f4>
     static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
     static constexpr index_t m_per_blk           = 16;   // from the instruction
     static constexpr index_t n_per_blk           = 16;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 128 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
     static constexpr bool is_k_reduction         = true; // ???
     // clang-format on
 
@@ -841,19 +841,28 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
     static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
     static constexpr index_t m_per_blk           = 32;   // from the instruction
     static constexpr index_t n_per_blk           = 32;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 64 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
     static constexpr bool is_k_reduction         = true; // ???
     // clang-format on
 
-    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              class FloatA,
+              class ScaleA,
+              class FloatB,
+              class ScaleB,
+              class FloatC>
     __device__ void run(const FloatA& a,
-                        const int32_t scale_a,
+                        const ScaleA& scale_a,
                         const FloatB& b,
-                        const int32_t scale_b,
+                        const ScaleB& scale_b,
                         FloatC& reg_c) const
     {
+        static_assert(scalar_type<ScaleA>::vector_size == 1, "Expect single scale at this point.");
+        static_assert(scalar_type<ScaleB>::vector_size == 1, "Expect single scale at this point.");
+
         intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
-            a, scale_a, b, scale_b, reg_c);
+            a, utils::get_exponent_value(scale_a), b, utils::get_exponent_value(scale_b), reg_c);
     }
 };
 
@@ -870,19 +879,28 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
     static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
     static constexpr index_t m_per_blk           = 16;   // from the instruction
     static constexpr index_t n_per_blk           = 16;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 128 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
     static constexpr bool is_k_reduction         = true; // ???
     // clang-format on
 
-    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              class FloatA,
+              class ScaleA,
+              class FloatB,
+              class ScaleB,
+              class FloatC>
     __device__ void run(const FloatA& a,
-                        const int32_t scale_a,
+                        const ScaleA& scale_a,
                         const FloatB& b,
-                        const int32_t scale_b,
+                        const ScaleB& scale_b,
                         FloatC& reg_c) const
     {
+        static_assert(scalar_type<ScaleA>::vector_size == 1, "Expect single scale at this point.");
+        static_assert(scalar_type<ScaleB>::vector_size == 1, "Expect single scale at this point.");
+
         intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
-            a, scale_a, b, scale_b, reg_c);
+            a, utils::get_exponent_value(scale_a), b, utils::get_exponent_value(scale_b), reg_c);
     }
 };
 
@@ -890,14 +908,16 @@ template <typename base_type,
           index_t MPerXdlops,
           index_t NPerXdlops,
           typename additional_type = base_type,
-          bool is_single_rate_mfma = false>
+          bool is_single_rate_mfma = false,
+          bool is_scale_mfma       = false>
 struct MfmaSelector
 {
     template <typename base_type_,
               index_t MPerXdlops_,
               index_t NPerXdlops_,
               typename additional_type_ = base_type_,
-              bool is_single_rate_mfma_ = false>
+              bool is_single_rate_mfma_ = false,
+              bool is_scale_mfma_       = false>
     static constexpr auto GetMfma();
 
     template <>
@@ -1053,40 +1073,49 @@ struct MfmaSelector
 #endif
     }
 
+    template <>
+    constexpr auto GetMfma<int8_t, 32, 32, int8_t, false>()
+    {
 #if defined(__gfx950__)
-    template <>
-    constexpr auto GetMfma<int8_t, 32, 32>()
-    {
         return MfmaInstr::mfma_i32_32x32x32i8;
-    }
-    template <>
-    constexpr auto GetMfma<int8_t, 16, 16>()
-    {
-        return MfmaInstr::mfma_i32_16x16x64i8;
-    }
 #elif defined(__gfx942__)
-    template <>
-    constexpr auto GetMfma<int8_t, 32, 32>()
-    {
         return MfmaInstr::mfma_i32_32x32x16i8;
-    }
-    template <>
-    constexpr auto GetMfma<int8_t, 16, 16>()
-    {
-        return MfmaInstr::mfma_i32_16x16x32i8;
-    }
 #else
-    template <>
-    constexpr auto GetMfma<int8_t, 32, 32>()
-    {
         return MfmaInstr::mfma_i32_32x32x8i8;
-    }
-    template <>
-    constexpr auto GetMfma<int8_t, 16, 16>()
-    {
-        return MfmaInstr::mfma_i32_16x16x16i8;
-    }
 #endif
+    }
+
+    template <>
+    constexpr auto GetMfma<int8_t, 32, 32, int8_t, true>()
+    {
+#if defined(__gfx942__) || defined(__gfx950__)
+        return MfmaInstr::mfma_i32_32x32x16i8;
+#else
+        return MfmaInstr::mfma_i32_32x32x8i8;
+#endif
+    }
+
+    template <>
+    constexpr auto GetMfma<int8_t, 16, 16, int8_t, false>()
+    {
+#if defined(__gfx950__)
+        return MfmaInstr::mfma_i32_16x16x64i8;
+#elif defined(__gfx942__)
+        return MfmaInstr::mfma_i32_16x16x32i8;
+#else
+        return MfmaInstr::mfma_i32_16x16x16i8;
+#endif
+    }
+
+    template <>
+    constexpr auto GetMfma<int8_t, 16, 16, int8_t, true>()
+    {
+#if defined(__gfx942__) || defined(__gfx950__)
+        return MfmaInstr::mfma_i32_16x16x32i8;
+#else
+        return MfmaInstr::mfma_i32_16x16x16i8;
+#endif
+    }
 
     template <>
     constexpr auto GetMfma<f8_t, 32, 32>()
@@ -1094,12 +1123,30 @@ struct MfmaSelector
         return MfmaInstr::mfma_f32_32x32x16f8f8;
     }
 
+    template <>
+    constexpr auto GetMfma<f8_t, 32, 32, f8_t, false, true>()
+    {
+        return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4;
+    }
+
     template <>
     constexpr auto GetMfma<f8_t, 16, 16>()
     {
         return MfmaInstr::mfma_f32_16x16x32f8f8;
     }
 
+    template <>
+    constexpr auto GetMfma<f8_t, 16, 16, f8_t, false, true>()
+    {
+        return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+    }
+
+    template <>
+    constexpr auto GetMfma<bf8_t, 16, 16, bf8_t, false, true>()
+    {
+        return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+    }
+
     template <>
     constexpr auto GetMfma<bf8_t, 32, 32>()
     {
@@ -1136,8 +1183,12 @@ struct MfmaSelector
         return MfmaInstr::mfma_f32_16x16x32bf8f8;
     }
 
-    static constexpr auto selected_mfma = mfma_type<
-        GetMfma<base_type, MPerXdlops, NPerXdlops, additional_type, is_single_rate_mfma>()>{};
+    static constexpr auto selected_mfma = mfma_type<GetMfma<base_type,
+                                                            MPerXdlops,
+                                                            NPerXdlops,
+                                                            additional_type,
+                                                            is_single_rate_mfma,
+                                                            is_scale_mfma>()>{};
 
     __host__ __device__ constexpr MfmaSelector()
     {
@@ -1185,7 +1236,8 @@ template <typename base_type,
           index_t NPerXdlops,
           index_t KPack,
           typename additional_type = base_type,
-          bool TransposeC          = false>
+          bool TransposeC          = false,
+          bool is_scale_mfma       = false>
 struct XdlopsGemm
 {
     static constexpr auto I0 = Number<0>{};
@@ -1216,7 +1268,7 @@ struct XdlopsGemm
                           MPerXdlops == 64,
                       "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops");
 
-        static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack cannot be divided by k_per_blk");
+        static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack should be a multiple of k_per_blk");
     }
 
     // XDL output supporting C = A * B
@@ -1359,6 +1411,27 @@ struct XdlopsGemm
         });
     }
 
+    template <class FloatA, class ScaleA, class FloatB, class ScaleB, class FloatC>
+    __device__ void Run(const FloatA& p_a_wave,
+                        const ScaleA& a_scale_thread,
+                        const FloatB& p_b_wave,
+                        const ScaleB& b_scale_thread,
+                        FloatC& p_c_thread) const
+    {
+        static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
+            if constexpr(!TransposeC)
+            {
+                mfma_instr.template run<MPerXdlops, NPerXdlops>(
+                    p_a_wave[k], a_scale_thread[k], p_b_wave[k], b_scale_thread[k], p_c_thread);
+            }
+            else
+            {
+                mfma_instr.template run<MPerXdlops, NPerXdlops>(
+                    p_b_wave[k], b_scale_thread[k], p_a_wave[k], a_scale_thread[k], p_c_thread);
+            }
+        });
+    }
+
     __device__ static auto GetLaneId() { return get_thread_local_1d_id() % mfma_instr.wave_size; }
 
     __device__ static auto GetBlkIdx()
@@ -1440,12 +1513,14 @@ struct XdlopsGemm
     }
 
     // Falls back to single rate instruction on gfx950 if KPack <= 4; no change on gfx942-
-    static constexpr auto
-        mfma = MfmaSelector < base_type,
-        MPerXdlops, NPerXdlops, additional_type,
-        ((is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value) && KPack <= 4)
-            ? true
-            : false > {};
+    static constexpr auto mfma = MfmaSelector < base_type, MPerXdlops, NPerXdlops, additional_type,
+                          (((is_same<base_type, half_t>::value ||
+                             is_same<base_type, bhalf_t>::value) &&
+                            KPack <= 4) ||
+                           (is_same<base_type, int8_t>::value && KPack <= 8))
+                              ? true
+                              : false,
+                          is_scale_mfma > {};
 
     static constexpr auto mfma_instr = mfma.selected_mfma;
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
index 8df0d885b9..a191c75099 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -106,9 +106,10 @@ struct TransformConvBwdDataToGemm_v1
             }
             else
             {
-                // Not possible to support even after split N.
-                // Too large tensor.
-                return N;
+                // Split Convolution's N dimension into N workgroups. However
+                // this still might not result in sufficiently small tensor,
+                // but at least later on we could divide the image as well.
+                return 1;
             }
         }
         else
@@ -186,7 +187,8 @@ struct TransformConvBwdDataToGemm_v1
           WTilde_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.WTilde_)},
           ZDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.ZDot_)},
           YDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.YDot_)},
-          XDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.XDot_)}
+          XDot_{static_cast<IndexType>(transform_conv_bwd_data_to_gemm_base.XDot_)},
+          batch_k_{transform_conv_bwd_data_to_gemm_base.batch_k_}
     {
     }
 
@@ -202,7 +204,8 @@ struct TransformConvBwdDataToGemm_v1
                                   const ConvSpatialDimsType& conv_filter_dilations,
                                   const ConvSpatialDimsType& input_left_pads,
                                   const ConvSpatialDimsType& input_right_pads,
-                                  const ConvSpatialDimsType& tildes)
+                                  const ConvSpatialDimsType& tildes,
+                                  const index_t batch_k = 1)
         : Hi_{c_g_n_c_wis_lengths[HIdx]},
           Wi_{c_g_n_c_wis_lengths[WIdx]},
           Ho_{a_g_n_k_wos_lengths[HIdx]},
@@ -230,7 +233,8 @@ struct TransformConvBwdDataToGemm_v1
           InRightPadH_{input_right_pads[HIdx - NonSpatialDimsNum]},
           InRightPadW_{input_right_pads[WIdx - NonSpatialDimsNum]},
           IdxYTilde_{tildes[YIdx - NonSpatialDimsNum]},
-          IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]}
+          IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]},
+          batch_k_{batch_k}
     {
         static_assert(is_same_v<ConvSpatialDimsType, std::array<IndexType, NDimSpatial>> ||
                       is_same_v<ConvSpatialDimsType, ck::Array<IndexType, NDimSpatial>>);
@@ -615,20 +619,22 @@ struct TransformConvBwdDataToGemm_v1
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
                          Filter1x1Stride1Pad0)
         {
-            const index_t AK0 = math::integer_divide_ceil(K_, AK1);
+            const index_t K0PerBlock = GemmKPerBlock / AK1;
+            const index_t AK0 =
+                math::integer_divide_ceil(K_, AK1 * K0PerBlock * batch_k_) * K0PerBlock;
 
             // A: output tensor
             const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor(
                 out_grid_desc,
                 make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_),
-                           make_unmerge_transform(make_tuple(AK0, AK1))),
+                           make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1))),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
 
             const auto out_gemmak0_gemmm_gemmak1_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     out_gemmak0_gemmmraw_gemmak1_grid_desc,
-                    make_tuple(AK0, GemmMPerBlock, AK1),
+                    make_tuple(AK0 * batch_k_, GemmMPerBlock, AK1),
                     Sequence<false, DoPadGemmM, false>{});
 
             return out_gemmak0_gemmm_gemmak1_grid_desc;
@@ -718,11 +724,15 @@ struct TransformConvBwdDataToGemm_v1
                         make_tuple(GemmKPerBlock, GemmMPerBlock),
                         Sequence<true, DoPadGemmM>{});
 
-                const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1;
+                const index_t K0PerBlock = GemmKPerBlock / AK1;
+                const index_t AK0 =
+                    math::integer_divide_ceil(out_gemmk_gemmm_padded_grid_desc.GetLength(I0),
+                                              AK1 * K0PerBlock * batch_k_) *
+                    K0PerBlock;
 
                 const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                     out_gemmk_gemmm_padded_grid_desc,
-                    make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                    make_tuple(make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1)),
                                make_pass_through_transform(
                                    out_gemmk_gemmm_padded_grid_desc.GetLength(I1))),
                     make_tuple(Sequence<0>{}, Sequence<1>{}),
@@ -815,11 +825,15 @@ struct TransformConvBwdDataToGemm_v1
                         make_tuple(GemmKPerBlock, GemmMPerBlock),
                         Sequence<true, DoPadGemmM>{});
 
-                const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1;
+                const index_t K0PerBlock = GemmKPerBlock / AK1;
+                const index_t AK0 =
+                    math::integer_divide_ceil(out_gemmk_gemmm_padded_grid_desc.GetLength(I0),
+                                              AK1 * K0PerBlock * batch_k_) *
+                    K0PerBlock;
 
                 const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                     out_gemmk_gemmm_padded_grid_desc,
-                    make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                    make_tuple(make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1)),
                                make_pass_through_transform(
                                    out_gemmk_gemmm_padded_grid_desc.GetLength(I1))),
                     make_tuple(Sequence<0>{}, Sequence<1>{}),
@@ -849,21 +863,23 @@ struct TransformConvBwdDataToGemm_v1
                      ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::
                          Filter1x1Stride1Pad0)
         {
-            const index_t BK0 = math::integer_divide_ceil(K_, BK1);
+            const index_t K0PerBlock = GemmKPerBlock / BK1;
+            const index_t BK0 =
+                math::integer_divide_ceil(K_, BK1 * K0PerBlock * batch_k_) * K0PerBlock;
 
             // B: weight tensor
-            const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc =
-                transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K_, C_)),
-                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
-                                                       make_pass_through_transform(C_)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = transform_tensor_descriptor(
+                make_naive_tensor_descriptor_packed(make_tuple(K_, C_)),
+                make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)),
+                           make_pass_through_transform(C_)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
             make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1));
 
             const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     wei_gemmbk0_gemmnraw_gemmbk1_grid_desc,
-                    make_tuple(BK0, GemmNPerBlock, BK1),
+                    make_tuple(BK0 * batch_k_, GemmNPerBlock, BK1),
                     Sequence<false, DoPadGemmN, false>{});
 
             return wei_gemmbk0_gemmn_gemmbk1_grid_desc;
@@ -924,11 +940,15 @@ struct TransformConvBwdDataToGemm_v1
                         make_tuple(GemmKPerBlock, GemmNPerBlock),
                         Sequence<true, DoPadGemmN>{});
 
-                const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1;
+                const index_t K0PerBlock = GemmKPerBlock / BK1;
+                const index_t BK0 =
+                    math::integer_divide_ceil(wei_gemmk_gemmn_padded_grid_desc.GetLength(I0),
+                                              BK1 * K0PerBlock * batch_k_) *
+                    K0PerBlock;
 
                 const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = transform_tensor_descriptor(
                     wei_gemmk_gemmn_padded_grid_desc,
-                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                    make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)),
                                make_pass_through_transform(
                                    wei_gemmk_gemmn_padded_grid_desc.GetLength(I1))),
                     make_tuple(Sequence<0>{}, Sequence<1>{}),
@@ -1005,11 +1025,15 @@ struct TransformConvBwdDataToGemm_v1
                         make_tuple(GemmKPerBlock, GemmNPerBlock),
                         Sequence<true, DoPadGemmN>{});
 
-                const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1;
+                const index_t K0PerBlock = GemmKPerBlock / BK1;
+                const index_t BK0 =
+                    math::integer_divide_ceil(wei_gemmk_gemmn_padded_grid_desc.GetLength(I0),
+                                              BK1 * K0PerBlock * batch_k_) *
+                    K0PerBlock;
 
                 const auto wei_gemmbk0_gemm_gemmbk1_grid_desc = transform_tensor_descriptor(
                     wei_gemmk_gemmn_padded_grid_desc,
-                    make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                    make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)),
                                make_pass_through_transform(
                                    wei_gemmk_gemmn_padded_grid_desc.GetLength(I1))),
                     make_tuple(Sequence<0>{}, Sequence<1>{}),
@@ -1354,6 +1378,7 @@ struct TransformConvBwdDataToGemm_v1
     IndexType ZTilde_, YTilde_, XTilde_;
     IndexType DTilde_, HTilde_, WTilde_;
     IndexType ZDot_, YDot_, XDot_;
+    index_t batch_k_;
 };
 
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
index 3db94deccb..c291f3994c 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -83,9 +83,10 @@ struct TransformConvFwdToGemm
             }
             else
             {
-                // Not possible to support even after split N.
-                // Too large tensor.
-                return N;
+                // Split Convolution's N dimension into N workgroups. However
+                // this still might not result in sufficiently small tensor,
+                // but at least later on we could divide the image as well.
+                return 1;
             }
         }
         else
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp
index 8100b0bdbd..0f28fe8169 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -12,6 +12,15 @@
 namespace ck {
 namespace tensor_operation {
 
+/*
+ * Transform Convolution NGCHW to NHWGC. We transform [N, G, C, H, W] tensor
+ * descriptor to [N * G * C, H * W] (input or output image). The first
+ * dimension is store dimension, the second one is load dimension. For
+ * NHWGC to NGCHW load and store are reverted. For weight we transform
+ * [G, K, C, Y, X] to [G * K * Y * X, C]. First dim is load dimension,
+ * second dim is store dimension.
+ */
+
 template <typename ALayout,
           typename BLayout,
           typename ELayout,
@@ -28,11 +37,13 @@ struct TransformConvNGCHWToNHWGC
     static constexpr auto I5 = Number<5>{};
 
     template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
-    static auto MakeNGCHWTransposeDesc(std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_lengths,
-                                       std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_strides)
+    static auto
+    MakeNGCHWTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_strides,
+                           const index_t split_n_size = 1)
     {
         const index_t& G  = g_n_c_wis_lengths[I0];
-        const index_t& N  = g_n_c_wis_lengths[I1];
+        const index_t N   = g_n_c_wis_lengths[I1] / split_n_size;
         const index_t& C  = g_n_c_wis_lengths[I2];
         const index_t& Wi = g_n_c_wis_lengths[I3];
 
@@ -54,11 +65,13 @@ struct TransformConvNGCHWToNHWGC
     }
 
     template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
-    static auto MakeNHWGCTransposeDesc(std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_lengths,
-                                       std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_strides)
+    static auto
+    MakeNHWGCTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_strides,
+                           const index_t split_n_size = 1)
     {
         const index_t& G  = g_n_c_wis_lengths[I0];
-        const index_t& N  = g_n_c_wis_lengths[I1];
+        const index_t N   = g_n_c_wis_lengths[I1] / split_n_size;
         const index_t& C  = g_n_c_wis_lengths[I2];
         const index_t& Wi = g_n_c_wis_lengths[I3];
 
@@ -80,11 +93,13 @@ struct TransformConvNGCHWToNHWGC
     }
 
     template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
-    static auto MakeNGCHWTransposeDesc(std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_lengths,
-                                       std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_strides)
+    static auto
+    MakeNGCHWTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_strides,
+                           const index_t split_n_size = 1)
     {
         const index_t& G  = g_n_c_wis_lengths[I0];
-        const index_t& N  = g_n_c_wis_lengths[I1];
+        const index_t N   = g_n_c_wis_lengths[I1] / split_n_size;
         const index_t& C  = g_n_c_wis_lengths[I2];
         const index_t& Hi = g_n_c_wis_lengths[I3];
         const index_t& Wi = g_n_c_wis_lengths[I4];
@@ -108,11 +123,13 @@ struct TransformConvNGCHWToNHWGC
     }
 
     template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
-    static auto MakeNHWGCTransposeDesc(std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_lengths,
-                                       std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_strides)
+    static auto
+    MakeNHWGCTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_strides,
+                           const index_t split_n_size = 1)
     {
         const index_t& G  = g_n_c_wis_lengths[I0];
-        const index_t& N  = g_n_c_wis_lengths[I1];
+        const index_t N   = g_n_c_wis_lengths[I1] / split_n_size;
         const index_t& C  = g_n_c_wis_lengths[I2];
         const index_t& Hi = g_n_c_wis_lengths[I3];
         const index_t& Wi = g_n_c_wis_lengths[I4];
@@ -136,11 +153,13 @@ struct TransformConvNGCHWToNHWGC
     }
 
     template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
-    static auto MakeNGCHWTransposeDesc(std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_lengths,
-                                       std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_strides)
+    static auto
+    MakeNGCHWTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_strides,
+                           const index_t split_n_size = 1)
     {
         const index_t& G  = g_n_c_wis_lengths[I0];
-        const index_t& N  = g_n_c_wis_lengths[I1];
+        const index_t N   = g_n_c_wis_lengths[I1] / split_n_size;
         const index_t& C  = g_n_c_wis_lengths[I2];
         const index_t& Di = g_n_c_wis_lengths[I3];
         const index_t& Hi = g_n_c_wis_lengths[I4];
@@ -167,11 +186,13 @@ struct TransformConvNGCHWToNHWGC
     }
 
     template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
-    static auto MakeNHWGCTransposeDesc(std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_lengths,
-                                       std::array<ck::index_t, NDimSpatial + 3> g_n_c_wis_strides)
+    static auto
+    MakeNHWGCTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_n_c_wis_strides,
+                           const index_t split_n_size = 1)
     {
         const index_t& G  = g_n_c_wis_lengths[I0];
-        const index_t& N  = g_n_c_wis_lengths[I1];
+        const index_t N   = g_n_c_wis_lengths[I1] / split_n_size;
         const index_t& C  = g_n_c_wis_lengths[I2];
         const index_t& Di = g_n_c_wis_lengths[I3];
         const index_t& Hi = g_n_c_wis_lengths[I4];
@@ -197,11 +218,185 @@ struct TransformConvNGCHWToNHWGC
             merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
     }
 
-    static auto TransposeStrides(const std::array<index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
-                                 const std::array<index_t, NDimSpatial + 3>& g_n_c_wis_strides)
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
+    static auto
+    MakeGKCYXTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
     {
-        if constexpr(device::is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                     device::is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+        const index_t& G = g_k_c_wis_lengths[I0];
+        const index_t& K = g_k_c_wis_lengths[I1];
+        const index_t& C = g_k_c_wis_lengths[I2];
+        const index_t& X = g_k_c_wis_lengths[I3];
+
+        const index_t& GStride = g_k_c_wis_strides[I0];
+        const index_t& KStride = g_k_c_wis_strides[I1];
+        const index_t& CStride = g_k_c_wis_strides[I2];
+        const index_t& XStride = g_k_c_wis_strides[I3];
+
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(G, K, C, X), make_tuple(GStride, KStride, CStride, XStride));
+        const auto merged_desc = transform_tensor_descriptor(
+            desc,
+            make_tuple(make_merge_transform(make_tuple(G, K, X)), make_pass_through_transform(C)),
+            make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+        return device::PadTensorDescriptor(
+            merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
+    static auto
+    MakeGKYXCTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
+    {
+        const index_t& G = g_k_c_wis_lengths[I0];
+        const index_t& K = g_k_c_wis_lengths[I1];
+        const index_t& C = g_k_c_wis_lengths[I2];
+        const index_t& X = g_k_c_wis_lengths[I3];
+
+        const index_t& GStride = g_k_c_wis_strides[I0];
+        const index_t KStride  = g_k_c_wis_strides[I1];
+        const index_t CStride  = 1;
+        const index_t XStride  = C;
+
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(G, K, C, X), make_tuple(GStride, KStride, CStride, XStride));
+        const auto merged_desc = transform_tensor_descriptor(
+            desc,
+            make_tuple(make_merge_transform(make_tuple(G, K, X)), make_pass_through_transform(C)),
+            make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+        return device::PadTensorDescriptor(
+            merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    static auto
+    MakeGKCYXTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
+    {
+        const index_t& G = g_k_c_wis_lengths[I0];
+        const index_t& K = g_k_c_wis_lengths[I1];
+        const index_t& C = g_k_c_wis_lengths[I2];
+        const index_t& Y = g_k_c_wis_lengths[I3];
+        const index_t& X = g_k_c_wis_lengths[I4];
+
+        const index_t& GStride = g_k_c_wis_strides[I0];
+        const index_t& KStride = g_k_c_wis_strides[I1];
+        const index_t& CStride = g_k_c_wis_strides[I2];
+        const index_t& YStride = g_k_c_wis_strides[I3];
+        const index_t& XStride = g_k_c_wis_strides[I4];
+
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(G, K, C, Y, X), make_tuple(GStride, KStride, CStride, YStride, XStride));
+        const auto merged_desc =
+            transform_tensor_descriptor(desc,
+                                        make_tuple(make_merge_transform(make_tuple(G, K, Y, X)),
+                                                   make_pass_through_transform(C)),
+                                        make_tuple(Sequence<0, 1, 3, 4>{}, Sequence<2>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        return device::PadTensorDescriptor(
+            merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    static auto
+    MakeGKYXCTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
+    {
+        const index_t& G = g_k_c_wis_lengths[I0];
+        const index_t& K = g_k_c_wis_lengths[I1];
+        const index_t& C = g_k_c_wis_lengths[I2];
+        const index_t& Y = g_k_c_wis_lengths[I3];
+        const index_t& X = g_k_c_wis_lengths[I4];
+
+        const index_t& GStride = g_k_c_wis_strides[I0];
+        const index_t KStride  = g_k_c_wis_strides[I1];
+        const index_t CStride  = 1;
+        const index_t YStride  = X * C;
+        const index_t XStride  = C;
+
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(G, K, C, Y, X), make_tuple(GStride, KStride, CStride, YStride, XStride));
+        const auto merged_desc =
+            transform_tensor_descriptor(desc,
+                                        make_tuple(make_merge_transform(make_tuple(G, K, Y, X)),
+                                                   make_pass_through_transform(C)),
+                                        make_tuple(Sequence<0, 1, 3, 4>{}, Sequence<2>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        return device::PadTensorDescriptor(
+            merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
+    static auto
+    MakeGKCYXTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
+    {
+        const index_t& G = g_k_c_wis_lengths[I0];
+        const index_t& K = g_k_c_wis_lengths[I1];
+        const index_t& C = g_k_c_wis_lengths[I2];
+        const index_t& Z = g_k_c_wis_lengths[I3];
+        const index_t& Y = g_k_c_wis_lengths[I4];
+        const index_t& X = g_k_c_wis_lengths[I5];
+
+        const index_t& GStride = g_k_c_wis_strides[I0];
+        const index_t& KStride = g_k_c_wis_strides[I1];
+        const index_t& CStride = g_k_c_wis_strides[I2];
+        const index_t& ZStride = g_k_c_wis_strides[I3];
+        const index_t& YStride = g_k_c_wis_strides[I4];
+        const index_t& XStride = g_k_c_wis_strides[I5];
+
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(G, K, C, Z, Y, X),
+            make_tuple(GStride, KStride, CStride, ZStride, YStride, XStride));
+        const auto merged_desc =
+            transform_tensor_descriptor(desc,
+                                        make_tuple(make_merge_transform(make_tuple(G, K, Z, Y, X)),
+                                                   make_pass_through_transform(C)),
+                                        make_tuple(Sequence<0, 1, 3, 4, 5>{}, Sequence<2>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        return device::PadTensorDescriptor(
+            merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
+    static auto
+    MakeGKYXCTransposeDesc(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                           const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
+    {
+        const index_t& G = g_k_c_wis_lengths[I0];
+        const index_t& K = g_k_c_wis_lengths[I1];
+        const index_t& C = g_k_c_wis_lengths[I2];
+        const index_t& Z = g_k_c_wis_lengths[I3];
+        const index_t& Y = g_k_c_wis_lengths[I4];
+        const index_t& X = g_k_c_wis_lengths[I5];
+
+        const index_t& GStride = g_k_c_wis_strides[I0];
+        const index_t KStride  = g_k_c_wis_strides[I1];
+        const index_t CStride  = 1;
+        const index_t ZStride  = Y * X * C;
+        const index_t YStride  = X * C;
+        const index_t XStride  = C;
+
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(G, K, C, Z, Y, X),
+            make_tuple(GStride, KStride, CStride, ZStride, YStride, XStride));
+        const auto merged_desc =
+            transform_tensor_descriptor(desc,
+                                        make_tuple(make_merge_transform(make_tuple(G, K, Z, Y, X)),
+                                                   make_pass_through_transform(C)),
+                                        make_tuple(Sequence<0, 1, 3, 4, 5>{}, Sequence<2>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        return device::PadTensorDescriptor(
+            merged_desc, make_tuple(MPerThread, NPerThread), Sequence<true, true>{});
+    }
+
+    static auto TransposeInOutStrides(const std::array<index_t, NDimSpatial + 3>& g_n_c_wis_lengths,
+                                      const std::array<index_t, NDimSpatial + 3>& g_n_c_wis_strides)
+    {
+        if constexpr(device::is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                     device::is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
         {
             std::array<index_t, NDimSpatial + 3> g_n_c_wis_strides_transposed;
             const auto G = g_n_c_wis_lengths[I0];
@@ -230,6 +425,41 @@ struct TransformConvNGCHWToNHWGC
             return g_n_c_wis_strides;
         }
     }
+
+    static auto
+    TransposeWeiStrides(const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_lengths,
+                        const std::array<ck::index_t, NDimSpatial + 3>& g_k_c_wis_strides)
+    {
+        if constexpr(device::is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                     device::is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+        {
+            std::array<index_t, NDimSpatial + 3> g_k_c_wis_strides_transposed = g_k_c_wis_strides;
+            const index_t C = g_k_c_wis_lengths[I2];
+
+            if constexpr(NDimSpatial == 2)
+            {
+                const index_t X                  = g_k_c_wis_lengths[I4];
+                g_k_c_wis_strides_transposed[I2] = 1;
+                g_k_c_wis_strides_transposed[I3] = X * C;
+                g_k_c_wis_strides_transposed[I4] = C;
+            }
+            else if constexpr(NDimSpatial == 3)
+            {
+                const index_t Y                  = g_k_c_wis_lengths[I4];
+                const index_t X                  = g_k_c_wis_lengths[I5];
+                g_k_c_wis_strides_transposed[I2] = 1;
+                g_k_c_wis_strides_transposed[I3] = Y * X * C;
+                g_k_c_wis_strides_transposed[I4] = X * C;
+                g_k_c_wis_strides_transposed[I5] = C;
+            }
+            return g_k_c_wis_strides_transposed;
+        }
+        else
+        {
+            // transpose not needed
+            return g_k_c_wis_strides;
+        }
+    }
 };
 
 } // namespace tensor_operation
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index 317f324e6d..62e3220b5a 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -581,7 +581,7 @@ __device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::typ
                                                       tmp.template AsType<half2_t>()[i]);
         });
     }
-#if defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
     else if constexpr(is_same<T, bhalf_t>::value)
     {
         vector_type<bhalf_t, N> tmp{src_thread_data};
diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp
new file mode 100644
index 0000000000..19869906dc
--- /dev/null
+++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp
@@ -0,0 +1,886 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "data_type.hpp"
+
+namespace ck {
+
+template <typename T>
+union BufferResource
+{
+    __device__ constexpr BufferResource() : content{} {}
+
+    // 128 bit SGPRs to supply buffer resource in buffer instructions
+    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
+    int32x4_t content;
+    StaticallyIndexedArray<T*, 2> address;
+    StaticallyIndexedArray<int32_t, 4> range;
+    StaticallyIndexedArray<int32_t, 4> config;
+};
+
+template <typename T>
+__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
+{
+    BufferResource<T> wave_buffer_resource;
+
+    // wavewise base address (64 bit)
+    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
+    // wavewise range (32 bit)
+    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
+    // wavewise setting (32 bit)
+    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
+
+    return wave_buffer_resource.content;
+}
+
+template <typename T>
+__device__ int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave)
+{
+    BufferResource<T> wave_buffer_resource;
+
+    // wavewise base address (64 bit)
+    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
+    // wavewise range (32 bit)
+    wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
+    // wavewise setting (32 bit)
+    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
+
+    return wave_buffer_resource.content;
+}
+
+template <typename T>
+__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(T* p_wave,
+                                                                index_t element_space_size)
+{
+    // wavewise base address (64 bit)
+    auto p         = const_cast<remove_cv_t<T>*>(p_wave);
+    int32_t stride = 0;
+    int32_t num    = element_space_size * sizeof(T);
+    auto flags     = CK_BUFFER_RESOURCE_3RD_DWORD;
+
+    return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
+}
+
+template <typename T>
+__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new(T* p_wave)
+{
+    // wavewise base address (64 bit)
+    auto p         = const_cast<remove_cv_t<T>*>(p_wave);
+    int32_t stride = 0;
+    int32_t num    = 0xffffffff;
+    auto flags     = CK_BUFFER_RESOURCE_3RD_DWORD;
+
+    return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
+}
+
+// buffer atomic-add fp16
+__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+    half2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32");
+
+// buffer atomic-add i32
+__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
+    int32_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32.v4i32");
+
+// buffer atomic-add fp32
+__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
+    float vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32");
+
+// buffer atomic-add fp32
+__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(
+    double vdata,
+    int32x4_t rsrc, // dst_wave_buffer_resource
+    int voffset,    // dst_thread_addr_offset
+    int soffset,    // dst_wave_addr_offset
+    int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32");
+
+// memory coherency bit for buffer store/load instruction
+// check ISA manual for each GFX target
+// e.g. for
+// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
+// page 67~68
+enum struct AmdBufferCoherenceEnum
+{
+    DefaultCoherence = 0, // default value
+    GLC              = 1,
+    SLC              = 2,
+    GLC_SLC          = 3,
+    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+    WAVE_NT0   = 0,
+    WAVE_NT1   = 2,
+    GROUP_NT0  = 1,
+    GROUP_NT1  = 3,
+    DEVICE_NT0 = 8,
+    DEVICE_NT1 = 10,
+    SYSTEM_NT0 = 9,
+    SYSTEM_NT1 = 11,
+};
+
+template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ typename vector_type<int8_t, N>::type
+amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
+                         index_t src_thread_addr_offset,
+                         index_t src_wave_addr_offset)
+{
+    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
+                  "wrong! not implemented");
+
+    if constexpr(N == 1)
+    {
+        return __builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 2)
+    {
+
+        int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
+                                                           src_thread_addr_offset,
+                                                           src_wave_addr_offset,
+                                                           static_cast<index_t>(coherence));
+
+        return bit_cast<int8x2_t>(tmp);
+    }
+    else if constexpr(N == 4)
+    {
+        int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
+                                                           src_thread_addr_offset,
+                                                           src_wave_addr_offset,
+                                                           static_cast<index_t>(coherence));
+
+        return bit_cast<int8x4_t>(tmp);
+    }
+    else if constexpr(N == 8)
+    {
+        int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
+                                                             src_thread_addr_offset,
+                                                             src_wave_addr_offset,
+                                                             static_cast<index_t>(coherence));
+
+        return bit_cast<int8x8_t>(tmp);
+    }
+    else if constexpr(N == 16)
+    {
+        int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
+        return bit_cast<int8x16_t>(tmp);
+    }
+    else if constexpr(N == 32)
+    {
+        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                               src_thread_addr_offset,
+                                                               src_wave_addr_offset,
+                                                               static_cast<index_t>(coherence));
+        int32x4_t tmp1 =
+            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
+                                                  static_cast<index_t>(coherence));
+        vector_type<int32_t, 8> tmp;
+
+        tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
+        tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
+
+        return bit_cast<int8x32_t>(tmp);
+    }
+    else if constexpr(N == 64)
+    {
+        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                               src_thread_addr_offset,
+                                                               src_wave_addr_offset,
+                                                               static_cast<index_t>(coherence));
+        int32x4_t tmp1 =
+            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
+                                                  static_cast<index_t>(coherence));
+        int32x4_t tmp2 =
+            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset + 8 * sizeof(int32_t),
+                                                  static_cast<index_t>(coherence));
+        int32x4_t tmp3 =
+            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset + 12 * sizeof(int32_t),
+                                                  static_cast<index_t>(coherence));
+
+        vector_type<int32_t, 16> tmp;
+
+        tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
+        tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
+        tmp.AsType<int32x4_t>()(Number<2>{}) = tmp2;
+        tmp.AsType<int32x4_t>()(Number<3>{}) = tmp3;
+
+        return bit_cast<int8x64_t>(tmp);
+    }
+}
+
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ typename vector_type<T, N>::type
+amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
+                     index_t src_thread_addr_offset,
+                     index_t src_wave_addr_offset)
+{
+    static_assert(
+        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, pk_i4_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+        "wrong! not implemented");
+
+    using r_t     = typename vector_type<T, N>::type;
+    auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
+    return bit_cast<r_t>(raw_data);
+}
+
+template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ void
+amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread_data,
+                          __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
+                          index_t dst_thread_addr_offset,
+                          index_t dst_wave_addr_offset)
+{
+    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
+                  "wrong! not implemented");
+
+    if constexpr(N == 1)
+    {
+        __builtin_amdgcn_raw_buffer_store_b8(src_thread_data,
+                                             dst_wave_buffer_resource,
+                                             dst_thread_addr_offset,
+                                             dst_wave_addr_offset,
+                                             static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 2)
+    {
+
+        __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 4)
+    {
+        __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 8)
+    {
+        __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 16)
+    {
+        __builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 32)
+    {
+        vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};
+
+        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<0>{}],
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               static_cast<index_t>(coherence));
+
+        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<1>{}],
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset + sizeof(int32_t) * 4,
+                                               static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 64)
+    {
+        vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};
+
+        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<0>{}],
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               static_cast<index_t>(coherence));
+
+        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<1>{}],
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset + sizeof(int32_t) * 4,
+                                               static_cast<index_t>(coherence));
+
+        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<2>{}],
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset + sizeof(int32_t) * 8,
+                                               static_cast<index_t>(coherence));
+
+        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<3>{}],
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset + sizeof(int32_t) * 12,
+                                               static_cast<index_t>(coherence));
+    }
+}
+
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
+                                      __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
+                                      index_t dst_thread_addr_offset,
+                                      index_t dst_wave_addr_offset)
+{
+    static_assert(
+        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, fp8_storage_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+        "wrong! not implemented");
+
+    using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;
+
+    amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
+                                                        dst_wave_buffer_resource,
+                                                        dst_thread_addr_offset,
+                                                        dst_wave_addr_offset);
+}
+
+template <typename T, index_t N>
+__device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
+                                           T* addr)
+{
+    static_assert((is_same<T, bhalf_t>::value && (N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)),
+                  "wrong! not implemented");
+
+    if constexpr(is_same<T, half_t>::value)
+    {
+        vector_type<half_t, N> tmp{src_thread_data};
+        static_for<0, N / 2, 1>{}([&](auto i) {
+            __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
+                                                      tmp.template AsType<half2_t>()[i]);
+        });
+    }
+#if defined(__gfx942__) || defined(__gfx950__)
+    else if constexpr(is_same<T, bhalf_t>::value)
+    {
+        vector_type<bhalf_t, N> tmp{src_thread_data};
+        static_for<0, N / 2, 1>{}([&](auto i) {
+            __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
+                                                       tmp.template AsType<bhalf2_t>()[i]);
+        });
+    }
+#endif
+}
+
+template <typename T, index_t N>
+__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
+                                           int32x4_t dst_wave_buffer_resource,
+                                           index_t dst_thread_addr_offset,
+                                           index_t dst_wave_addr_offset)
+{
+    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
+                  "wrong! not implemented");
+
+    if constexpr(is_same<T, float>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+        }
+        else if constexpr(N == 2)
+        {
+            vector_type<float, 2> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + sizeof(float),
+                                                   0);
+        }
+        else if constexpr(N == 4)
+        {
+            vector_type<float, 4> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + sizeof(float),
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + 2 * sizeof(float),
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + 3 * sizeof(float),
+                                                   0);
+        }
+    }
+    else if constexpr(is_same<T, half_t>::value)
+    {
+        if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data,
+                                                     dst_wave_buffer_resource,
+                                                     dst_thread_addr_offset,
+                                                     dst_wave_addr_offset,
+                                                     0);
+        }
+        else if constexpr(N == 4)
+        {
+            vector_type<half_t, 4> tmp{src_thread_data};
+
+            static_for<0, 2, 1>{}([&](auto i) {
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
+                                                         dst_wave_buffer_resource,
+                                                         dst_thread_addr_offset,
+                                                         dst_wave_addr_offset + i * sizeof(half2_t),
+                                                         0);
+            });
+        }
+        else if constexpr(N == 8)
+        {
+            vector_type<half_t, 8> tmp{src_thread_data};
+
+            static_for<0, 4, 1>{}([&](auto i) {
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
+                                                         dst_wave_buffer_resource,
+                                                         dst_thread_addr_offset,
+                                                         dst_wave_addr_offset + i * sizeof(half2_t),
+                                                         0);
+            });
+        }
+    }
+    else if constexpr(is_same<T, int32_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset,
+                                                  0);
+        }
+        else if constexpr(N == 2)
+        {
+            vector_type<int32_t, 2> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset,
+                                                  0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset + sizeof(int32_t),
+                                                  0);
+        }
+        else if constexpr(N == 4)
+        {
+            vector_type<int32_t, 4> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset,
+                                                  0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset + sizeof(int32_t),
+                                                  0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<2>{}],
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
+                                                  0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<3>{}],
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
+                                                  0);
+        }
+    }
+}
+
+template <typename T, index_t N>
+__device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
+                                           int32x4_t dst_wave_buffer_resource,
+                                           index_t dst_thread_addr_offset,
+                                           index_t dst_wave_addr_offset)
+{
+    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
+                  "wrong! not implemented");
+    if constexpr(is_same<T, double>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data,
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+        }
+        else if constexpr(N == 2)
+        {
+            vector_type<double, 2> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + sizeof(double),
+                                                   0);
+        }
+        else if constexpr(N == 4)
+        {
+            vector_type<double, 4> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + sizeof(double),
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<2>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + 2 * sizeof(double),
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<3>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + 3 * sizeof(double),
+                                                   0);
+        }
+    }
+}
+
+// buffer_load requires:
+//   1) p_src_wave must point to global memory space
+//   2) p_src_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ typename vector_type_maker<T, N>::type::type
+amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
+                                            index_t src_thread_element_offset,
+                                            bool src_thread_element_valid,
+                                            index_t src_element_space_size)
+{
+    const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
+        make_wave_buffer_resource_new(p_src_wave, src_element_space_size);
+
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+
+    using vector_t = typename vector_type_maker<T, N>::type::type;
+    using scalar_t = typename scalar_type<vector_t>::type;
+
+    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
+
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
+    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
+    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
+
+#else
+
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
+    return src_thread_element_valid ? tmp : vector_t(0);
+#endif
+}
+
+// buffer_load requires:
+//   1) p_src_wave must point to global memory space
+//   2) p_src_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ typename vector_type_maker<T, N>::type::type
+amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
+                                                        index_t src_thread_element_offset,
+                                                        bool src_thread_element_valid,
+                                                        index_t src_element_space_size,
+                                                        T customized_value)
+{
+    const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
+        make_wave_buffer_resource_new(p_src_wave, src_element_space_size);
+
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+
+    using vector_t = typename vector_type_maker<T, N>::type::type;
+    using scalar_t = typename scalar_type<vector_t>::type;
+
+    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
+
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
+
+    return src_thread_element_valid ? tmp : vector_t(customized_value);
+}
+
+// buffer_store requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T,
+          index_t N,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
+                                 T* p_dst_wave,
+                                 const index_t dst_thread_element_offset,
+                                 const bool dst_thread_element_valid,
+                                 const index_t dst_element_space_size)
+{
+    const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
+        make_wave_buffer_resource_new(p_dst_wave, dst_element_space_size);
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+    using vector_t                = typename vector_type_maker<T, N>::type::type;
+    using scalar_t                = typename scalar_type<vector_t>::type;
+    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
+
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
+    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
+    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+    if(dst_thread_element_valid)
+    {
+        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+    }
+#endif
+}
+
+// buffer_atomic_add requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T, index_t N>
+__device__ void
+amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
+                      T* p_dst_wave,
+                      const index_t dst_thread_element_offset,
+                      const bool dst_thread_element_valid,
+                      const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+    using vector_t                = typename vector_type_maker<T, N>::type::type;
+    using scalar_t                = typename scalar_type<vector_t>::type;
+    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
+
+    if constexpr(is_same<T, bhalf_t>::value)
+    {
+        if(dst_thread_element_valid)
+        {
+            amd_global_atomic_add_impl<scalar_t, vector_size>(
+                src_thread_data, p_dst_wave + dst_thread_element_offset);
+        }
+    }
+    else
+    {
+#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
+        uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
+
+        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
+            src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+        if(dst_thread_element_valid)
+        {
+            amd_buffer_atomic_add_impl<scalar_t, vector_size>(
+                src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+        }
+#endif
+    }
+}
+
+// buffer_atomic_max requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T, index_t N>
+__device__ void
+amd_buffer_atomic_max(const typename vector_type_maker<T, N>::type::type src_thread_data,
+                      T* p_dst_wave,
+                      const index_t dst_thread_element_offset,
+                      const bool dst_thread_element_valid,
+                      const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+    using vector_t                = typename vector_type_maker<T, N>::type::type;
+    using scalar_t                = typename scalar_type<vector_t>::type;
+    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
+
+#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
+    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
+
+    amd_buffer_atomic_max_impl<scalar_t, vector_size>(
+        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+    if(dst_thread_element_valid)
+    {
+        amd_buffer_atomic_max_impl<scalar_t, vector_size>(
+            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+    }
+#endif
+}
+
+// Direct loads from global to LDS.
+__device__ void
+llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
+                                __attribute__((address_space(3))) uint32_t* lds_ptr,
+                                index_t size,
+                                index_t voffset,
+                                index_t soffset,
+                                index_t offset,
+                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32");
+
+#ifndef __HIPCC_RTC__
+template <typename T, index_t NumElemsPerThread>
+__device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
+                                              const index_t global_offset,
+                                              T* lds_base_ptr,
+                                              const index_t lds_offset,
+                                              const bool is_valid,
+                                              const index_t src_element_space_size)
+{
+    // Direct loads require that each thread reads and writes exactly a single DWORD.
+    constexpr auto dword_bytes      = 4;
+    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
+    static_assert(bytes_per_thread == dword_bytes);
+
+#ifndef CK_CODE_GEN_RTC
+    const uint32_t* global_ptr =
+        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
+#else
+    const uint32_t* global_ptr =
+        reinterpret_cast<uint32_t*>(reinterpret_cast<size_t>(global_base_ptr));
+#endif
+    const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
+    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
+
+#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
+    T* lds_ptr = lds_base_ptr + lds_offset;
+#ifndef CK_CODE_GEN_RTC
+    auto const lds_ptr_sgpr =
+        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
+#else
+    auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
+#endif
+    asm volatile("s_mov_b32 m0, %0; \n\t"
+                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
+                 "v"(global_offset_bytes),
+                 "s"(src_resource)
+                 : "memory");
+#else
+    // LDS pointer must be attributed with the LDS address space.
+    __attribute__((address_space(3))) uint32_t* lds_ptr =
+#ifndef CK_CODE_GEN_RTC
+        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
+            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
+#else
+        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
+            reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
+#endif
+
+    llvm_amdgcn_raw_buffer_load_lds(
+        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
+#endif
+}
+#endif
+
+} // namespace ck
diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp
index 42b784d303..d079639c6a 100644
--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
@@ -6,6 +6,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/enable_if.hpp"
 #include "ck/utility/random_gen.hpp"
+#include "ck/utility/functional.hpp"
 #include "ck/utility/type.hpp"
 
 #ifdef CK_USE_FNUZ_FP8
@@ -20,8 +21,7 @@
 #define CK_USE_OCP_FP8 0
 #endif
 
-#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \
-    defined(__gfx1201__) || defined(__gfx950__)) &&                                              \
+#if(defined(__gfx942__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx950__)) && \
     __HIP_DEVICE_COMPILE__
 #define CK_FP8_CVT_FAST_PATH 1
 #else
@@ -64,6 +64,9 @@ enum class ck_saturation_t
 namespace fp8_impl {
 
 typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2)));
+typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
+typedef ushort ushortx2_t __attribute__((ext_vector_type(2)));
+typedef short shortx2_t __attribute__((ext_vector_type(2)));
 typedef float float2_t __attribute__((ext_vector_type(2)));
 
 __host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a)
@@ -193,10 +196,10 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
         }
     }
 
-    typename std::conditional<
+    typename ck::conditional_t<
         sizeof(T) == 2,
         unsigned short int,
-        typename std::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::type>::type
+        typename ck::conditional_t<sizeof(T) == 4, unsigned int, unsigned long long>>
         retval;
 
     if constexpr(we == 5 && is_half && !is_fnuz)
@@ -243,7 +246,7 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
 
 #if CK_FP8_CVT_FAST_PATH
 template <ck_fp8_interpretation_t interpret>
-static __device__ float cast_to_f32_from_f8(fp8_storage_t v)
+static __host__ __device__ float cast_to_f32_from_f8(fp8_storage_t v)
 {
     union
     {
@@ -270,7 +273,7 @@ static __device__ float cast_to_f32_from_f8(fp8_storage_t v)
 }
 
 template <ck_fp8_interpretation_t interpret>
-static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v)
+static __device__ float2_t cast_to_f32_from_f8(fp8x2_storage_t v)
 {
     const auto i16val = bit_cast<uint16_t>(v);
 
@@ -458,6 +461,510 @@ __is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp)
 #endif
 }
 
+#if defined(__gfx950__)
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0)
+{
+    union
+    {
+        unsigned int i32val;
+        half2_t half_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr unsigned int i32val = 0;
+    val.half_vec[0]               = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 448.0, -448.0);
+        }
+    }
+
+    val.i32val =
+        __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(i32val, val.half_vec[0], rng, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0)
+{
+    // there is no packed conversion with SR, so convert one element at a time
+    return fp8x2_storage_t{
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0)
+{
+    union
+    {
+        unsigned int i32val;
+        half2_t half_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr unsigned int i32val = 0;
+    val.half_vec[0]               = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 57344.0, -57344.0);
+        }
+    }
+
+    val.i32val =
+        __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(i32val, val.half_vec[0], rng, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0)
+{
+    // there is no packed conversion with SR, so convert one element at a time
+    return fp8x2_storage_t{
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0)
+{
+    ignore = rng;
+
+    union
+    {
+        unsigned int i32val;
+        half2_t half_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.half_vec[0]              = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 448.0, -448.0);
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0)
+{
+#if CK_WORKAROUND_FP16_TO_FP8_CONVERSION
+    return fp8x2_storage_t{
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+#else
+    ignore = rng;
+
+    union
+    {
+        half2_t half_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.half_vec                 = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i16_vec[0] & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 448.0, -448.0);
+        }
+        if((val.i16_vec[1] & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[1] = __builtin_amdgcn_fmed3h(val.half_vec[1], 448.0, -448.0);
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0);
+
+    return fp8x2_storage_t{val.i8val[0], val.i8val[1]};
+#endif
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0)
+{
+    ignore = rng;
+
+    union
+    {
+        unsigned int i32val;
+        half2_t half_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.half_vec[0]              = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 57344.0, -57344.0);
+        }
+    }
+
+    val.half_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0)
+{
+#if CK_WORKAROUND_FP16_TO_FP8_CONVERSION
+    return fp8x2_storage_t{
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_f16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+#else
+    ignore = rng;
+
+    union
+    {
+        half2_t half_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.half_vec                 = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i16_vec[0] & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 57344.0, -57344.0);
+        }
+        if((val.i16_vec[1] & 0x7FFF) != 0x7FFF)
+        {
+            val.half_vec[1] = __builtin_amdgcn_fmed3h(val.half_vec[1], 57344.0, -57344.0);
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0);
+
+    return fp8x2_storage_t{val.i8val[0], val.i8val[1]};
+#endif
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0)
+{
+    union
+    {
+        unsigned int i32val;
+        ushortx2_t bhalf_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr unsigned int i32val = 0;
+    val.bhalf_vec[0]              = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[0] =
+                ushort((bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                            bit_cast<float>(uint32_t{val.bhalf_vec[0]} << 16), 448.0, -448.0)) >>
+                        16)); // convert to float and back
+        }
+    }
+
+    val.i32val = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(
+        i32val, val.bhalf_vec[0], rng, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0)
+{
+    // there is no packed conversion with SR, so convert one element at a time
+    return fp8x2_storage_t{
+        cast_to_f8_from_bf16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_bf16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0)
+{
+    union
+    {
+        unsigned int i32val;
+        ushortx2_t bhalf_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr unsigned int i32val = 0;
+    val.bhalf_vec[0]              = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[0] = ushort(
+                (bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                     bit_cast<float>(uint32_t{val.bhalf_vec[0]} << 16), 57344.0, -57344.0)) >>
+                 16)); // convert to float and back
+        }
+    }
+
+    val.i32val = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(
+        i32val, val.bhalf_vec[0], rng, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == true, bool>                       = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0)
+{
+    // there is no packed conversion with SR, so convert one element at a time
+    return fp8x2_storage_t{
+        cast_to_f8_from_bf16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_bf16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0)
+{
+    ignore = rng;
+
+    union
+    {
+        unsigned int i32val;
+        ushortx2_t bhalf_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.bhalf_vec[0]             = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[0] =
+                ushort((bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                            bit_cast<float>(uint32_t{val.bhalf_vec[0]} << 16), 448.0, -448.0)) >>
+                        16)); // convert to float and back
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E4M3_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0)
+{
+#if CK_WORKAROUND_BF16_TO_FP8_CONVERSION
+    return fp8x2_storage_t{
+        cast_to_f8_from_bf16<interpret, saturate, stochastic_rounding>(v[0], rng),
+        cast_to_f8_from_bf16<interpret, saturate, stochastic_rounding>(v[1], rng)};
+#else
+    ignore = rng;
+
+    union
+    {
+        ushortx2_t bhalf_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.bhalf_vec                = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i16_vec[0] & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[0] =
+                ushort((bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                            bit_cast<float>(uint32_t{val.bhalf_vec[0]} << 16), 448.0, -448.0)) >>
+                        16)); // convert to float and back
+        }
+        if((val.i16_vec[1] & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[1] =
+                ushort((bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                            bit_cast<float>(uint32_t{val.bhalf_vec[1]} << 16), 448.0, -448.0)) >>
+                        16)); // convert to float and back
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0);
+
+    return fp8x2_storage_t{val.i8val[0], val.i8val[1]};
+#endif
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0)
+{
+    ignore = rng;
+
+    union
+    {
+        unsigned int i32val;
+        ushortx2_t bhalf_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.bhalf_vec[0]             = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i32val & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[0] = ushort(
+                (bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                     bit_cast<float>(uint32_t{val.bhalf_vec[0]} << 16), 57344.0, -57344.0)) >>
+                 16)); // convert to float and back
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0);
+
+    return val.i8val[0];
+}
+
+template <ck_fp8_interpretation_t interpret,
+          bool saturate,
+          bool stochastic_rounding                                                 = false,
+          ck::enable_if_t<interpret == ck_fp8_interpretation_t::CK_E5M2_OCP, bool> = false,
+          ck::enable_if_t<stochastic_rounding == false, bool>                      = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0)
+{
+    ignore = rng;
+
+    union
+    {
+        ushortx2_t bhalf_vec;
+        shortx2_t i16_vec;
+        fp8_storage_t i8val[4];
+    } val;
+
+    constexpr shortx2_t i16x2val = {0, 0};
+    val.bhalf_vec                = v;
+
+    if constexpr(saturate)
+    {
+        if((val.i16_vec[0] & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[0] = ushort(
+                (bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                     bit_cast<float>(uint32_t{val.bhalf_vec[0]} << 16), 57344.0, -57344.0)) >>
+                 16)); // convert to float and back
+        }
+        if((val.i16_vec[1] & 0x7FFF) != 0x7FFF)
+        {
+            val.bhalf_vec[1] = ushort(
+                (bit_cast<uint32_t>(__builtin_amdgcn_fmed3f(
+                     bit_cast<float>(uint32_t{val.bhalf_vec[1]} << 16), 57344.0, -57344.0)) >>
+                 16)); // convert to float and back
+        }
+    }
+
+    val.i16_vec =
+        __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0);
+
+    return fp8x2_storage_t{val.i8val[0], val.i8val[1]};
+}
+#endif // defined(__gfx950__)
+
 #if CK_FP8_CVT_FAST_PATH
 // The conversion function is from rocblas
 // https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79
@@ -523,6 +1030,84 @@ static __device__ fp8_storage_t cast_to_f8_from_f32(float v, unsigned int rng =
     }
     return i8data;
 }
+
+template <ck_fp8_interpretation_t interpret, bool saturate, bool stochastic_rounding = false>
+static __device__ fp8x2_storage_t cast_to_f8_from_f32(float2_t v, unsigned int rng = 0)
+{
+    if constexpr(stochastic_rounding)
+    {
+        // there is no packed conversion with SR, so convert one element at a time
+        return fp8x2_storage_t{
+            cast_to_f8_from_f32<interpret, saturate, stochastic_rounding>(v[0], rng),
+            cast_to_f8_from_f32<interpret, saturate, stochastic_rounding>(v[1], rng)};
+    }
+    else
+    {
+        union
+        {
+            float fval;
+            unsigned int i32val;
+            unsigned char i8val[4];
+        } val0, val1;
+
+        val0.fval = v[0];
+        val1.fval = v[1];
+
+        unsigned int ival = 0;
+
+        if constexpr(saturate)
+        {
+            if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+            {
+                if((val0.i32val & 0x7F800000) != 0x7F800000)
+                { /// propagate NAN/INF, no clipping
+                    val0.fval = __builtin_amdgcn_fmed3f(val0.fval, 240.0, -240.0);
+                }
+                if((val1.i32val & 0x7F800000) != 0x7F800000)
+                { /// propagate NAN/INF, no clipping
+                    val1.fval = __builtin_amdgcn_fmed3f(val1.fval, 240.0, -240.0);
+                }
+            }
+            else if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
+            { // OCP type
+                if((val0.i32val & 0x7F800000) != 0x7F800000)
+                { /// propagate NAN/INF, no clipping
+                    val0.fval = __builtin_amdgcn_fmed3f(val0.fval, 448.0, -448.0);
+                }
+                if((val1.i32val & 0x7F800000) != 0x7F800000)
+                { /// propagate NAN/INF, no clipping
+                    val1.fval = __builtin_amdgcn_fmed3f(val1.fval, 448.0, -448.0);
+                }
+            }
+            else
+            {
+                if((val0.i32val & 0x7F800000) != 0x7F800000)
+                { /// propagate NAN/INF, no clipping
+                    val0.fval = __builtin_amdgcn_fmed3f(val0.fval, 57344.0, -57344.0);
+                }
+                if((val1.i32val & 0x7F800000) != 0x7F800000)
+                { /// propagate NAN/INF, no clipping
+                    val1.fval = __builtin_amdgcn_fmed3f(val1.fval, 57344.0, -57344.0);
+                }
+            }
+        }
+
+        // RNE CVT
+        if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                     (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP))
+        {
+            ival = __builtin_amdgcn_cvt_pk_fp8_f32(val0.fval, val1.fval, ival, false);
+        }
+        else
+        {
+            ival = __builtin_amdgcn_cvt_pk_bf8_f32(val0.fval, val1.fval, ival, false);
+        }
+
+        val0.i32val = ival;
+
+        return fp8x2_storage_t{val0.i8val[0], val0.i8val[1]};
+    }
+}
 #endif // CK_FP8_CVT_FAST_PATH
 
 // The conversion function is from rocblas
@@ -539,10 +1124,10 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn
 
     constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10);
 
-    using T_bitwise = typename std::conditional<
+    using T_bitwise = typename ck::conditional_t<
         sizeof(T) == 2,
         unsigned short int,
-        typename std::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::type>::type;
+        typename ck::conditional_t<sizeof(T) == 4, unsigned int, unsigned long long>>;
     T_bitwise x_bitwise = bit_cast<T_bitwise>(_x);
 
     unsigned long long x{x_bitwise};
@@ -797,6 +1382,7 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn
  *
  * \tparam interp interpretation of fp8
  * \tparam sat saturation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
  * \param f float number
  * \return fp8_storage_t
  */
@@ -882,6 +1468,47 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
 #endif // CK_FP8_CVT_FAST_PATH
 }
 
+/**
+ * \brief convert vector of 2 floats to vector of 2 @p fp8_storage_t
+ *
+ * \tparam interp interpretation of fp8
+ * \tparam sat saturation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param f vector of 2 floats
+ * \return fp8x2_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH
+__device__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f)
+{
+    __is_interpret_supported(interp);
+    uint32_t rng = 0;
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f[0]);
+#else
+        rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&f), f[0]);
+#endif
+    }
+    return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+        f, rng);
+#else
+#if CK_USE_OCP_FP8
+__host__ __device__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f)
+{
+#else
+__host__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f)
+{
+#endif // CK_USE_OCP_FP8
+    return fp8x2_storage_t{cvt_float_to_fp8<interp, sat, stochastic_rounding>(f[0]),
+                           cvt_float_to_fp8<interp, sat, stochastic_rounding>(f[1])};
+#endif // CK_FP8_CVT_FAST_PATH
+}
+
 /**
  * \brief convert _Float16 to @p fp8_storage_t
  *
@@ -900,87 +1527,168 @@ __host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16
 __host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
 #endif
 {
-    return cvt_float_to_fp8<interp, sat, stochastic_rounding>(static_cast<float>(x));
+    {
+        __is_interpret_supported(interp);
+        uint32_t rng = 0;
+        if constexpr(stochastic_rounding)
+        {
+            constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
+        }
+#if defined(__gfx950__)
+        return cast_to_f8_from_f16<interp,
+                                   sat == ck_saturation_t::CK_SATFINITE,
+                                   stochastic_rounding>(x, rng);
+#else
+        ignore = rng;
+        return cvt_float_to_fp8<interp, ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+            static_cast<float>(x));
+#endif // defined(__gfx950__)
+    }
+}
+
+/**
+ * \brief convert vector of 2 _Float16 to vector of 2 @p fp8_storage_t
+ *
+ * \tparam sat saturation of fp8
+ * \tparam interp interpretation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param x vector of 2 _Float16
+ * \return fp8x2_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8
+__host__ __device__ static inline fp8x2_storage_t cvt_half_t_to_fp8(const half2_t x)
+#else
+__host__ static inline fp8x2_storage_t cvt_half_t_to_fp8(const half2_t x)
+#endif
+{
+    {
+        __is_interpret_supported(interp);
+        uint32_t rng = 0;
+        if constexpr(stochastic_rounding)
+        {
+            constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#else
+            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
+#endif
+        }
+#if defined(__gfx950__)
+        return cast_to_f8_from_f16<interp,
+                                   sat == ck_saturation_t::CK_SATFINITE,
+                                   stochastic_rounding>(x, rng);
+#else
+        ignore = rng;
+        return cvt_float_to_fp8<interp, ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+            float2_t{static_cast<float>(x[0]), static_cast<float>(x[1])});
+#endif // defined(__gfx950__)
+    }
+}
+
+/**
+ * \brief convert bhalf_t to @p fp8_storage_t
+ *
+ * \tparam sat saturation of fp8
+ * \tparam interp interpretation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param x bhalf_t value
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_bhalf_t_to_fp8(const ushort x)
+#else
+__host__ static inline fp8_storage_t cvt_bhalf_t_to_fp8(const ushort x)
+#endif
+{
+    {
+        __is_interpret_supported(interp);
+        uint32_t rng = 0;
+        if constexpr(stochastic_rounding)
+        {
+            constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x),
+                                               static_cast<float>(x));
+#else
+            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), static_cast<float>(x));
+#endif
+        }
+#if defined(__gfx950__)
+        return cast_to_f8_from_bf16<interp,
+                                    sat == ck_saturation_t::CK_SATFINITE,
+                                    stochastic_rounding>(x, rng);
+#else
+        ignore = rng;
+        return cvt_float_to_fp8<interp, ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+            bit_cast<float>(uint32_t{x} << 16)); // convert value to float
+#endif // defined(__gfx950__)
+    }
+}
+
+/**
+ * \brief convert vector of 2 bhalf_t to vector of 2 @p fp8_storage_t
+ *
+ * \tparam sat saturation of fp8
+ * \tparam interp interpretation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param x vector of 2 bhalf_t
+ * \return fp8x2_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8
+__host__ __device__ static inline fp8x2_storage_t cvt_bhalf_t_to_fp8(const ushortx2_t x)
+#else
+__host__ static inline fp8x2_storage_t cvt_bhalf_t_to_fp8(const ushortx2_t x)
+#endif
+{
+#if CK_WORKAROUND_BF16_TO_FP8_CONVERSION
+    return cvt_float_to_fp8<interp, ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+        float2_t{bit_cast<float>(uint32_t{x[0]} << 16),
+                 bit_cast<float>(uint32_t{x[1]} << 16)}); // convert values to float
+#else                                                     // CK_WORKAROUND_BF16_TO_FP8_CONVERSION
+    {
+        __is_interpret_supported(interp);
+        uint32_t rng = 0;
+        if constexpr(stochastic_rounding)
+        {
+            constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x),
+                                               static_cast<float>(x[0]));
+#else
+            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x),
+                                               static_cast<float>(x[0]));
+#endif
+        }
+#if defined(__gfx950__)
+        return cast_to_f8_from_bf16<interp,
+                                    sat == ck_saturation_t::CK_SATFINITE,
+                                    stochastic_rounding>(x, rng);
+#else
+        ignore = rng;
+        return cvt_float_to_fp8<interp, ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+            float2_t{bit_cast<float>(uint32_t{x[0]} << 16),
+                     bit_cast<float>(uint32_t{x[1]} << 16)}); // convert values to float
+#endif // defined(__gfx950__)
+    }
+#endif // CK_WORKAROUND_BF16_TO_FP8_CONVERSION
 }
 
 } // namespace fp8_impl
 
-// Declare a template function for fp8 conversion using RNE
-template <typename Y, typename X>
-__host__ __device__ constexpr Y f8_convert_rne(X x);
-
-// convert fp32 to fp8 with rounding to nearest even
-template <>
-inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, float>(float x)
-{
-    return f8_ocp_t{
-        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
-}
-
-// convert fp32 to bf8 with rounding to nearest even
-template <>
-inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, float>(float x)
-{
-    return bf8_ocp_t{
-        fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(x)};
-}
-
-// convert _Float16 to fp8 with rounding to nearest even
-template <>
-inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, _Float16>(_Float16 x)
-{
-    return f8_ocp_t{
-        fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
-}
-
-template <>
-inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, _Float16>(_Float16 x)
-{
-    return bf8_ocp_t{
-        fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
-            x)};
-}
-
-// Declare a template function for fp8 conversion using RNE
-template <typename Y, typename X>
-__host__ __device__ constexpr Y f8_convert_sr(X x);
-
-// convert fp32 to fp8 with stochastic rounding
-template <>
-inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, float>(float x)
-{
-    return f8_ocp_t{
-        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation, true>(
-            x)};
-}
-
-// convert fp32 to bf8 with stochastic rounding
-template <>
-inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, float>(float x)
-{
-    return bf8_ocp_t{fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret,
-                                                bf8_ocp_t::default_saturation,
-                                                true>(x)};
-}
-
-// convert _Float16 to fp8 with stochastic rounding
-template <>
-inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, _Float16>(_Float16 x)
-{
-    return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret,
-                                                f8_ocp_t::default_saturation,
-                                                true>(x)};
-}
-
-// convert _Float16 to bf8 with stochastic rounding
-template <>
-inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, _Float16>(_Float16 x)
-{
-    return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret,
-                                                 bf8_ocp_t::default_saturation,
-                                                 true>(x)};
-}
-
 #if CK_USE_OCP_FP8
 using f8_t  = f8_ocp_t;
 using bf8_t = bf8_ocp_t;
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index 113f3af4ae..0ed60df2c3 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -5,12 +5,19 @@
 #define CK_AMD_INLINE_ASM_HPP
 
 #include "c_style_pointer_cast.hpp"
-#include "data_type.hpp"
+#include "dtype_vector.hpp"
 
 // TODO: deprecate all amd_assembly_outer_product_xxx
 
 namespace ck {
 
+inline __device__ int amd_assembly_and_b32(int a, int b)
+{
+    int c;
+    asm volatile("v_and_b32 %0, %1, %2" : "=v"(c) : "v"(a), "v"(b));
+    return c;
+}
+
 inline __device__ int amd_assembly_and_or_b32(int a, int b, int d)
 {
     int c;
@@ -32,6 +39,54 @@ inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b)
     return c;
 }
 
+inline __device__ float amd_assemble_cvt_f32_i4(int b)
+{
+    float a;
+    asm volatile("v_cvt_off_f32_i4 %0, %1" : "=v"(a) : "v"(b));
+    return a;
+}
+
+inline __device__ f8x4_t amd_assembly_cvt_f8_to_f32(float b0, float b1, float b2, float b3)
+{
+    f8x4_t a;
+    asm volatile("v_cvt_pk_fp8_f32 %0, %1, %2\n"
+                 "v_cvt_pk_fp8_f32 %0, %3, %4, op_sel:[0, 0, 1]\n"
+                 : "=v"(a)
+                 : "v"(b0), "v"(b1), "v"(b2), "v"(b3));
+    return a;
+}
+
+inline __device__ f8x8_t amd_assembly_i4_to_fp8x8(int a)
+{
+    uint32_t i4x8 = static_cast<uint32_t>(a);
+    uint32_t fp8x4_0;
+    uint32_t fp8x4_1;
+    float tmp_0, tmp_1, tmp_2;
+
+    asm volatile("v_cvt_off_f32_i4 %[v_tmp_0], %[v_src]\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_src], src0_sel:BYTE_2\n"
+                 "v_cvt_pk_fp8_f32 %[v_dst_0], %[v_tmp_0], %[v_tmp_1]\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_1\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_src], src0_sel:BYTE_3\n"
+                 "v_cvt_pk_fp8_f32 %[v_dst_1], %[v_tmp_0], %[v_tmp_1]\n"
+                 "v_lshrrev_b32 %[v_tmp_2], 4, %[v_src]\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_tmp_2]\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_tmp_2], src0_sel:BYTE_2\n"
+                 "v_cvt_pk_fp8_f32 %[v_dst_0], %[v_tmp_0], %[v_tmp_1], op_sel:[0, 0, 1]\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_0], %[v_tmp_2], src0_sel:BYTE_1\n"
+                 "v_cvt_off_f32_i4 %[v_tmp_1], %[v_tmp_2], src0_sel:BYTE_3\n"
+                 "v_cvt_pk_fp8_f32 %[v_dst_1], %[v_tmp_0], %[v_tmp_1], op_sel:[0, 0, 1]\n"
+                 : [v_tmp_0] "+v"(tmp_0),
+                   [v_tmp_1] "+v"(tmp_1),
+                   [v_tmp_2] "+v"(tmp_2),
+                   [v_dst_0] "+v"(fp8x4_0),
+                   [v_dst_1] "+v"(fp8x4_1),
+                   [v_src] "+v"(i4x8)
+                 :);
+
+    return bit_cast<f8x8_t>(((static_cast<uint64_t>(fp8x4_1) << 32) | fp8x4_0));
+}
+
 // c0 += inner_product(a, b0)
 // c1 += inner_product(a, b1)
 __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp
index aa519fb2be..e14c0d62a8 100644
--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_AMD_WMMA_HPP
 #define CK_AMD_WMMA_HPP
@@ -341,5 +341,101 @@ struct intrin_wmma_i32_16x16x16_iu8_w32_gfx12<16, 16, neg_a, neg_b, clamp>
     }
 };
 
+// src: f8, f8, dst: fp32
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_f8f8_w32_gfx12;
+
+template <>
+struct intrin_wmma_f32_16x16x16_f8f8_w32_gfx12<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx12__)
+        reg_c.template AsType<float8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(
+                bit_cast<int32x2_t>(reg_a),
+                bit_cast<int32x2_t>(reg_b),
+                reg_c.template AsType<float8_t>()[Number<0>{}]);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+
+// src: f8, bf8, dst: fp32
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12;
+
+template <>
+struct intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const f8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx12__)
+        reg_c.template AsType<float8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(
+                bit_cast<int32x2_t>(reg_a),
+                bit_cast<int32x2_t>(reg_b),
+                reg_c.template AsType<float8_t>()[Number<0>{}]);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+
+// src: bf8, f8, dst: fp32
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12;
+
+template <>
+struct intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bf8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx12__)
+        reg_c.template AsType<float8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(
+                bit_cast<int32x2_t>(reg_a),
+                bit_cast<int32x2_t>(reg_b),
+                reg_c.template AsType<float8_t>()[Number<0>{}]);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+
+// src: bf8, bf8, dst: fp32
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12;
+
+template <>
+struct intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bf8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx12__)
+        reg_c.template AsType<float8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(
+                bit_cast<int32x2_t>(reg_a),
+                bit_cast<int32x2_t>(reg_b),
+                reg_c.template AsType<float8_t>()[Number<0>{}]);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+
 } // namespace ck
 #endif
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index d00c2ad554..fc2528c4d0 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -2,10 +2,11 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
+#include "ck/utility/dtype_fp64.hpp"
 
 namespace ck {
 // Define the common macro for MI300 models
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__)
 #define __gfx94__
 #endif
 
@@ -547,9 +548,9 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
 {
     template <class FloatC>
     __device__ static void Run(const f8x32_t& reg_a,
-                               const int32_t scale_a,
+                               const int32_t& scale_a,
                                const f8x32_t& reg_b,
-                               const int32_t scale_b,
+                               const int32_t& scale_b,
                                FloatC& reg_c)
     {
 #if defined(__gfx950__)
@@ -565,6 +566,14 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
                 scale_a,
                 0, // OPSEL
                 scale_b);
+        // XXX: Note on the scale_a and scale_b parameters:
+        // If compiler detects that one or both scales are constant values, it will treat that
+        // constant as F32 constant. I.e., if scale_a at some point was declared as
+        // `e8m0_bexp_t a_scale{1.0f}`, the instruction would only work if scale_a parameter is
+        // assigned value `bit_cast<int32_t>(static_cast<float>(a_scale))`.
+
+        // XXX: Note on the OPSEL parameters: Instruction always takes byte0 as a scale value even
+        // when OPSEL is set otherwise.
 #else
         ignore = reg_a;
         ignore = scale_a;
@@ -617,9 +626,9 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
 {
     template <class FloatC>
     __device__ static void Run(const f8x32_t& reg_a,
-                               const int32_t scale_a,
+                               const int32_t& scale_a,
                                const f8x32_t& reg_b,
-                               const int32_t scale_b,
+                               const int32_t& scale_b,
                                FloatC& reg_c)
     {
 #if defined(__gfx950__)
@@ -644,6 +653,34 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
 #endif
     }
 
+    __device__ static void Run(const bf8x32_t& reg_a,
+                               const int32_t& scale_a,
+                               const bf8x32_t& reg_b,
+                               const int32_t& scale_b,
+                               FloatC& reg_c)
+    {
+#if defined(__gfx950__)
+        // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10
+        reg_c.template AsType<float4_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                reg_a,
+                reg_b,
+                reg_c.template AsType<float4_t>()[Number<0>{}],
+                1, // cbsz
+                1, // blgp
+                0, // OPSEL
+                scale_a,
+                0, // OPSEL
+                scale_b);
+#else
+        ignore = reg_a;
+        ignore = scale_a;
+        ignore = reg_b;
+        ignore = scale_b;
+        ignore = reg_c;
+#endif
+    }
+
     template <class FloatC>
     __device__ static void Run(const f4x32_t& reg_a,
                                const int32_t scale_a,
diff --git a/include/ck/utility/blkgemmpipe_scheduler.hpp b/include/ck/utility/blkgemmpipe_scheduler.hpp
index 574be5c4ad..39407cb8f6 100644
--- a/include/ck/utility/blkgemmpipe_scheduler.hpp
+++ b/include/ck/utility/blkgemmpipe_scheduler.hpp
@@ -85,7 +85,7 @@ struct BlockwiseGemmXdlops_pipeline_hotloop_inst
     static constexpr index_t A_LDS_Read_Inst_Num =
         WaveNumN * MPerBlock * KPerBlock / (BlockSize * ALDSReadWidth);
     static constexpr index_t B_LDS_Read_Inst_Num =
-        WaveNumM * MPerBlock * KPerBlock / (BlockSize * BLDSReadWidth);
+        WaveNumM * NPerBlock * KPerBlock / (BlockSize * BLDSReadWidth);
 
     static constexpr index_t C_MFMA_Inst_Num =
         MPerBlock * NPerBlock * KPerBlock / (BlockSize / WaveSize) / (MPerXDL * NPerXDL * KPerXDL);
diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp
index f95660a8a4..c2c3aa002c 100644
--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -33,7 +33,11 @@
 #include "ck/utility/thread_group.hpp"
 #include "ck/utility/debug.hpp"
 
-#include "ck/utility/amd_buffer_addressing.hpp"
+#if __clang_major__ == 20
+#include "amd_buffer_addressing_builtins.hpp"
+#else
+#include "amd_buffer_addressing.hpp"
+#endif
 #include "ck/utility/amd_wave_read_first_lane.hpp"
 #include "ck/utility/generic_memory_space_atomic.hpp"
 #include "ck/utility/get_id.hpp"
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index f7bdde172e..a6106bb146 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -19,7 +19,7 @@ using float_t  = float;
 #endif // __HIPCC_RTC__
 
 namespace ck {
-#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 using byte = unsigned char;
 #else
 using std::byte;
@@ -346,53 +346,6 @@ inline constexpr bool is_native_type()
            is_same<T, f6_t>::value || is_same<T, bf6_t>::value;
 }
 
-// vector_type
-template <typename T, index_t N, typename Enable = void>
-struct vector_type;
-
-// Caution: DO NOT REMOVE
-// intentionally have only declaration but no definition to cause compilation failure when trying to
-// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
-// vectors"
-template <typename T, index_t V, index_t N>
-struct vector_type<T __attribute__((ext_vector_type(V))), N>;
-
-// Caution: DO NOT REMOVE
-// intentionally have only declaration but no definition to cause compilation failure when trying to
-// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
-// vectors"
-template <typename T, index_t V, index_t N>
-struct vector_type<vector_type<T, V>, N>;
-
-// vector_type_maker
-// This is the right way to handle "vector of vectors": making a bigger vector instead
-template <typename T, index_t N>
-struct vector_type_maker
-{
-    using type = vector_type<T, N>;
-};
-
-template <typename T, index_t N0, index_t N1>
-struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
-{
-    using type = vector_type<T, N0 * N1>;
-};
-
-template <typename T, index_t N0, index_t N1>
-struct vector_type_maker<vector_type<T, N1>, N0>
-{
-    using type = vector_type<T, N0 * N1>;
-};
-
-template <typename T, index_t N>
-using vector_type_maker_t = typename vector_type_maker<T, N>::type;
-
-template <typename T, index_t N>
-__host__ __device__ constexpr auto make_vector_type(Number<N>)
-{
-    return typename vector_type_maker<T, N>::type{};
-}
-
 // scalar_type
 template <typename TV>
 struct scalar_type;
@@ -416,13 +369,6 @@ struct scalar_type<T __attribute__((ext_vector_type(N)))>
     static constexpr index_t vector_size = N;
 };
 
-template <typename T, index_t N>
-struct scalar_type<vector_type<T, N>>
-{
-    using type                           = T;
-    static constexpr index_t vector_size = N;
-};
-
 //
 template <>
 struct scalar_type<double>
@@ -518,9 +464,9 @@ struct scalar_type<bf8_ocp_t>
 };
 
 template <>
-struct scalar_type<bool>
+struct scalar_type<e8m0_bexp_t>
 {
-    using type                           = bool;
+    using type                           = e8m0_bexp_t::type;
     static constexpr index_t vector_size = 1;
 };
 
@@ -531,2868 +477,17 @@ struct scalar_type<f4x2_pk_t>
     static constexpr index_t vector_size = 1;
 };
 
-template <typename T>
-struct vector_type<T, 1, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    using type = d1_t;
-
-    union
-    {
-        T d1_;
-        StaticallyIndexedArray<T, 1> d1x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        return data_.d1x1_;
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        return data_.d1x1_;
-    }
-};
-
-__device__ int static err = 0;
-template <typename T>
-struct vector_type<T, 2, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-
-    using type = d2_t;
-
-    union
-    {
-        d2_t d2_;
-        StaticallyIndexedArray<d1_t, 2> d1x2_;
-        StaticallyIndexedArray<d2_t, 1> d2x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x2_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x2_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 3, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d3_t __attribute__((ext_vector_type(3)));
-
-    using type = d3_t;
-
-    union
-    {
-        d3_t d3_;
-        StaticallyIndexedArray<d1_t, 3> d1x3_;
-        StaticallyIndexedArray<d2_t, 1> d2x1_;
-        StaticallyIndexedArray<d3_t, 1> d3x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d3_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x3_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x1_;
-        }
-        else if constexpr(is_same<X, d3_t>::value)
-        {
-            return data_.d3x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d3_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x3_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x1_;
-        }
-        else if constexpr(is_same<X, d3_t>::value)
-        {
-            return data_.d3x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 4, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-
-    using type = d4_t;
-
-    union
-    {
-        d4_t d4_;
-        StaticallyIndexedArray<d1_t, 4> d1x4_;
-        StaticallyIndexedArray<d2_t, 2> d2x2_;
-        StaticallyIndexedArray<d4_t, 1> d4x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x4_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x2_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x4_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x2_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 5, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d5_t __attribute__((ext_vector_type(5)));
-
-    using type = d5_t;
-
-    union
-    {
-        d5_t d5_;
-        StaticallyIndexedArray<d1_t, 5> d1x5_;
-        StaticallyIndexedArray<d4_t, 1> d4x1_;
-        StaticallyIndexedArray<d5_t, 1> d5x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value || is_same<X, d5_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x5_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else if constexpr(is_same<X, d5_t>::value)
-        {
-            return data_.d5x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value || is_same<X, d5_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x5_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else if constexpr(is_same<X, d5_t>::value)
-        {
-            return data_.d5x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 7, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d7_t __attribute__((ext_vector_type(7)));
-
-    using type = d7_t;
-
-    union
-    {
-        d7_t d7_;
-        StaticallyIndexedArray<d1_t, 7> d1x7_;
-        StaticallyIndexedArray<d2_t, 3> d2x3_;
-        StaticallyIndexedArray<d4_t, 1> d4x1_;
-        StaticallyIndexedArray<d7_t, 1> d7x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d7_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x7_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x3_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else if constexpr(is_same<X, d7_t>::value)
-        {
-            return data_.d7x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d7_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x7_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x3_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else if constexpr(is_same<X, d7_t>::value)
-        {
-            return data_.d7x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 8, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-
-    using type = d8_t;
-
-    union
-    {
-        d8_t d8_;
-        StaticallyIndexedArray<d1_t, 8> d1x8_;
-        StaticallyIndexedArray<d2_t, 4> d2x4_;
-        StaticallyIndexedArray<d4_t, 2> d4x2_;
-        StaticallyIndexedArray<d8_t, 1> d8x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x8_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x4_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x2_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x8_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x4_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x2_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 13, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-    typedef T d13_t __attribute__((ext_vector_type(13)));
-
-    using type = d13_t;
-
-    union
-    {
-        d13_t d13_;
-        StaticallyIndexedArray<d1_t, 13> d1x13_;
-        StaticallyIndexedArray<d4_t, 3> d4x3_;
-        StaticallyIndexedArray<d8_t, 1> d8x1_;
-        StaticallyIndexedArray<d13_t, 1> d13x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value ||
-                          is_same<X, d8_t>::value || is_same<X, d13_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x13_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x3_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x1_;
-        }
-        else if constexpr(is_same<X, d13_t>::value)
-        {
-            return data_.d13x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value ||
-                          is_same<X, d8_t>::value || is_same<X, d13_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x13_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x3_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x1_;
-        }
-        else if constexpr(is_same<X, d13_t>::value)
-        {
-            return data_.d13x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 16, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-    typedef T d16_t __attribute__((ext_vector_type(16)));
-
-    using type = d16_t;
-
-    union
-    {
-        d16_t d16_;
-        StaticallyIndexedArray<d1_t, 16> d1x16_;
-        StaticallyIndexedArray<d2_t, 8> d2x8_;
-        StaticallyIndexedArray<d4_t, 4> d4x4_;
-        StaticallyIndexedArray<d8_t, 2> d8x2_;
-        StaticallyIndexedArray<d16_t, 1> d16x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x16_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x8_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x4_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x2_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x16_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x8_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x4_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x2_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 32, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-    typedef T d16_t __attribute__((ext_vector_type(16)));
-    typedef T d32_t __attribute__((ext_vector_type(32)));
-
-    using type = d32_t;
-
-    union
-    {
-        d32_t d32_;
-        StaticallyIndexedArray<d1_t, 32> d1x32_;
-        StaticallyIndexedArray<d2_t, 16> d2x16_;
-        StaticallyIndexedArray<d4_t, 8> d4x8_;
-        StaticallyIndexedArray<d8_t, 4> d8x4_;
-        StaticallyIndexedArray<d16_t, 2> d16x2_;
-        StaticallyIndexedArray<d32_t, 1> d32x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x32_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x16_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x8_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x4_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x2_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x32_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x16_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x8_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x4_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x2_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 64, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-    typedef T d16_t __attribute__((ext_vector_type(16)));
-    typedef T d32_t __attribute__((ext_vector_type(32)));
-    typedef T d64_t __attribute__((ext_vector_type(64)));
-
-    using type = d64_t;
-
-    union
-    {
-        d64_t d64_;
-        StaticallyIndexedArray<d1_t, 64> d1x64_;
-        StaticallyIndexedArray<d2_t, 32> d2x32_;
-        StaticallyIndexedArray<d4_t, 16> d4x16_;
-        StaticallyIndexedArray<d8_t, 8> d8x8_;
-        StaticallyIndexedArray<d16_t, 4> d16x4_;
-        StaticallyIndexedArray<d32_t, 2> d32x2_;
-        StaticallyIndexedArray<d64_t, 1> d64x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                          is_same<X, d64_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x64_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x32_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x16_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x8_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x4_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x2_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                          is_same<X, d64_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x64_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x32_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x16_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x8_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x4_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x2_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 128, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-    typedef T d16_t __attribute__((ext_vector_type(16)));
-    typedef T d32_t __attribute__((ext_vector_type(32)));
-    typedef T d64_t __attribute__((ext_vector_type(64)));
-    typedef T d128_t __attribute__((ext_vector_type(128)));
-
-    using type = d128_t;
-
-    union
-    {
-        d128_t d128_;
-        StaticallyIndexedArray<d1_t, 128> d1x128_;
-        StaticallyIndexedArray<d2_t, 64> d2x64_;
-        StaticallyIndexedArray<d4_t, 32> d4x32_;
-        StaticallyIndexedArray<d8_t, 16> d8x16_;
-        StaticallyIndexedArray<d16_t, 8> d16x8_;
-        StaticallyIndexedArray<d32_t, 4> d32x4_;
-        StaticallyIndexedArray<d64_t, 2> d64x2_;
-        StaticallyIndexedArray<d128_t, 1> d128x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                          is_same<X, d64_t>::value || is_same<X, d128_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x128_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x64_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x32_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x16_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x8_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x4_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x2_;
-        }
-        else if constexpr(is_same<X, d128_t>::value)
-        {
-            return data_.d128x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                          is_same<X, d64_t>::value || is_same<X, d128_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x128_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x64_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x32_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x16_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x8_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x4_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x2_;
-        }
-        else if constexpr(is_same<X, d128_t>::value)
-        {
-            return data_.d128x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 256, typename ck::enable_if_t<is_native_type<T>()>>
-{
-    using d1_t = T;
-    typedef T d2_t __attribute__((ext_vector_type(2)));
-    typedef T d4_t __attribute__((ext_vector_type(4)));
-    typedef T d8_t __attribute__((ext_vector_type(8)));
-    typedef T d16_t __attribute__((ext_vector_type(16)));
-    typedef T d32_t __attribute__((ext_vector_type(32)));
-    typedef T d64_t __attribute__((ext_vector_type(64)));
-    typedef T d128_t __attribute__((ext_vector_type(128)));
-    typedef T d256_t __attribute__((ext_vector_type(256)));
-
-    using type = d256_t;
-
-    union
-    {
-        d256_t d256_;
-        StaticallyIndexedArray<d1_t, 256> d1x256_;
-        StaticallyIndexedArray<d2_t, 128> d2x128_;
-        StaticallyIndexedArray<d4_t, 64> d4x64_;
-        StaticallyIndexedArray<d8_t, 32> d8x32_;
-        StaticallyIndexedArray<d16_t, 16> d16x16_;
-        StaticallyIndexedArray<d32_t, 8> d32x8_;
-        StaticallyIndexedArray<d64_t, 4> d64x4_;
-        StaticallyIndexedArray<d128_t, 2> d128x2_;
-        StaticallyIndexedArray<d256_t, 1> d256x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(
-            is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
-                is_same<X, d8_t>::value || is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                is_same<X, d64_t>::value || is_same<X, d128_t>::value || is_same<X, d256_t>::value,
-            "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x256_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x128_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x64_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x32_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x16_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x8_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x4_;
-        }
-        else if constexpr(is_same<X, d128_t>::value)
-        {
-            return data_.d128x2_;
-        }
-        else if constexpr(is_same<X, d256_t>::value)
-        {
-            return data_.d256x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(
-            is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
-                is_same<X, d8_t>::value || is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                is_same<X, d64_t>::value || is_same<X, d128_t>::value || is_same<X, d256_t>::value,
-            "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x256_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x128_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x64_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x32_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x16_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x8_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x4_;
-        }
-        else if constexpr(is_same<X, d128_t>::value)
-        {
-            return data_.d128x2_;
-        }
-        else if constexpr(is_same<X, d256_t>::value)
-        {
-            return data_.d256x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T, index_t N, typename Enable = void>
-struct non_native_vector_base;
-
-template <typename T>
-struct nnvb_data_t_selector
-{
-    using type = unsigned _BitInt(8 * sizeof(T));
-};
-
 template <>
-struct nnvb_data_t_selector<f8_ocp_t>
+struct scalar_type<bool>
 {
-    using type = f8_ocp_t::data_type;
+    using type                           = bool;
+    static constexpr index_t vector_size = 1;
 };
 
-template <>
-struct nnvb_data_t_selector<bf8_ocp_t>
-{
-    using type = bf8_ocp_t::data_type;
-};
-
-template <>
-struct nnvb_data_t_selector<f6x16_pk_t>
-{
-    using type = f6x16_pk_t::type;
-};
-
-template <>
-struct nnvb_data_t_selector<f6x32_pk_t>
-{
-    using type = f6x32_pk_t::type;
-};
-
-template <>
-struct nnvb_data_t_selector<bf6x16_pk_t>
-{
-    using type = bf6x16_pk_t::type;
-};
-
-template <>
-struct nnvb_data_t_selector<bf6x32_pk_t>
-{
-    using type = bf6x32_pk_t::type;
-};
-
-template <>
-struct nnvb_data_t_selector<pk_i4_t>
-{
-    using type = pk_i4_t::type;
-};
-
-template <typename T, index_t N>
-struct non_native_vector_base<
-    T,
-    N,
-    ck::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>>
-{
-    using data_t = typename nnvb_data_t_selector<T>::type; // select data_t based on the size of T
-    static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
-    using data_v = data_t __attribute__((ext_vector_type(N)));
-    using type   = non_native_vector_base<T, N>;
-
-    union alignas(next_pow2(N * sizeof(T)))
-    {
-        data_v dN; // storage vector;
-        StaticallyIndexedArray<data_t, N> dxN;
-        StaticallyIndexedArray<T, N> dTxN;
-        StaticallyIndexedArray<data_v, 1> dNx1;
-    } data_;
-
-    __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {}
-    __host__ __device__ constexpr non_native_vector_base(T f)
-        : non_native_vector_base(bit_cast<data_t>(f))
-    {
-    }
-    __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
-    __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
-
-    __host__ __device__ constexpr operator data_v() const { return data_.dN; }
-    __host__ __device__ constexpr operator data_t() const
-    {
-        if constexpr(N == 1)
-        {
-            return data_.dxN[Number<0>{}];
-        }
-        else
-        {
-            return data_.dxN; // XXX this should cause an error
-        }
-    }
-    __host__ __device__ constexpr operator T() const
-    {
-        if constexpr(N == 1)
-        {
-            return data_.dTxN[Number<0>{}];
-        }
-        else
-        {
-            return data_.dTxN; // XXX this should cause an error
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same_v<X, data_t>)
-        {
-            return data_.dxN;
-        }
-        else if constexpr(is_same_v<X, T>)
-        {
-            return data_.dTxN;
-        }
-        else if constexpr(is_same_v<X, data_v>)
-        {
-            return data_.dNx1;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same_v<X, data_t>)
-        {
-            return data_.dxN;
-        }
-        else if constexpr(is_same_v<X, T>)
-        {
-            return data_.dTxN;
-        }
-        else if constexpr(is_same_v<X, data_v>)
-        {
-            return data_.dNx1;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-// implementation for f6x16 and f6x32
-template <typename T, index_t N>
-struct non_native_vector_base<T, N, std::enable_if_t<sizeof(T) == 12 || sizeof(T) == 24>>
-{
-    using data_t =
-        typename nnvb_data_t_selector<T>::type; // select data_t based on declared base type
-    using element_t = typename T::element_type; // select element_t based on declared element type
-    static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
-    static constexpr size_t size_factor =
-        sizeof(data_t) / sizeof(element_t); // f6x16: 12/4 = 3, f6x32: 24/4 = 6
-    using data_v = element_t __attribute__((ext_vector_type(N * size_factor)));
-    using type   = non_native_vector_base<T, N>;
-
-    union alignas(next_pow2(N * sizeof(T)))
-    {
-        data_v dN; // storage vector;
-        StaticallyIndexedArray<data_t, N> dxN;
-        StaticallyIndexedArray<T, N> dTxN;
-        StaticallyIndexedArray<data_v, 1> dNx1;
-    } data_;
-
-    __host__ __device__ constexpr non_native_vector_base(data_t a)
-        : data_{data_v(a.At(Number<0>{}))}
-    {
-    }
-    __host__ __device__ constexpr non_native_vector_base(T f)
-        : non_native_vector_base(bit_cast<data_t>(f))
-    {
-    }
-    __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
-    __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
-
-    __host__ __device__ constexpr operator data_v() const { return data_.dN; }
-    __host__ __device__ constexpr operator data_t() const
-    {
-        if constexpr(N == 1)
-        {
-            return data_.dxN[Number<0>{}];
-        }
-        else
-        {
-            return data_.dxN; // XXX this should cause an error
-        }
-    }
-    __host__ __device__ constexpr operator T() const
-    {
-        if constexpr(N == 1)
-        {
-            return data_.dTxN[Number<0>{}];
-        }
-        else
-        {
-            return data_.dTxN; // XXX this should cause an error
-        }
-    }
-};
-
-template <typename T, index_t N>
-struct scalar_type<non_native_vector_base<T, N>>;
-
-template <index_t N>
-struct scalar_type<non_native_vector_base<f8_ocp_t, N>>
-{
-    using type = typename non_native_vector_base<f8_ocp_t, N>::data_t;
-
-    static constexpr index_t vector_size = N;
-};
-
-template <index_t N>
-struct scalar_type<non_native_vector_base<bf8_ocp_t, N>>
-{
-    using type = typename non_native_vector_base<bf8_ocp_t, N>::data_t;
-
-    static constexpr index_t vector_size = N;
-};
-
-template <index_t N>
-struct scalar_type<non_native_vector_base<pk_i4_t, N>>
-{
-    using type = typename non_native_vector_base<pk_i4_t, N>::data_t;
-
-    static constexpr index_t vector_size = N;
-};
-
-template <index_t N>
-struct scalar_type<non_native_vector_base<f4x2_pk_t, N>>
-{
-    using type = typename non_native_vector_base<f4x2_pk_t, N>::data_t;
-
-    static constexpr index_t vector_size = N;
-};
-
-// non-native vector_type implementation
-template <typename T>
-struct vector_type<T, 1, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t     = T;
-    using d1_nnv_t = non_native_vector_base<T, 1>;
-    using type     = d1_nnv_t;
-
-    union alignas(next_pow2(1 * sizeof(T)))
-    {
-        d1_t d1_;
-        StaticallyIndexedArray<d1_t, 1> d1x1_;
-        d1_nnv_t d1_nnv_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{d1_t{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 2, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t     = T;
-    using d1_nnv_t = non_native_vector_base<T, 1>;
-    using d2_t     = non_native_vector_base<T, 2>;
-
-    using type = d2_t;
-
-    union alignas(next_pow2(2 * sizeof(T)))
-    {
-        d2_t d2_;
-        StaticallyIndexedArray<d1_t, 2> d1x2_;
-        StaticallyIndexedArray<d2_t, 1> d2x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x2_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x2_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 4, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t     = T;
-    using d1_nnv_t = non_native_vector_base<T, 1>;
-    using d2_t     = non_native_vector_base<T, 2>;
-    using d4_t     = non_native_vector_base<T, 4>;
-
-    using type = d4_t;
-
-    union alignas(next_pow2(4 * sizeof(T)))
-    {
-        d4_t d4_;
-        StaticallyIndexedArray<d1_t, 4> d1x4_;
-        StaticallyIndexedArray<d2_t, 2> d2x2_;
-        StaticallyIndexedArray<d4_t, 1> d4x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x4_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x2_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x4_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x2_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 8, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t     = T;
-    using d1_nnv_t = non_native_vector_base<T, 1>;
-    using d2_t     = non_native_vector_base<T, 2>;
-    using d4_t     = non_native_vector_base<T, 4>;
-    using d8_t     = non_native_vector_base<T, 8>;
-
-    using type = d8_t;
-
-    union alignas(next_pow2(8 * sizeof(T)))
-    {
-        d8_t d8_;
-        StaticallyIndexedArray<d1_t, 8> d1x8_;
-        StaticallyIndexedArray<d2_t, 4> d2x4_;
-        StaticallyIndexedArray<d4_t, 2> d4x2_;
-        StaticallyIndexedArray<d8_t, 1> d8x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
-                          is_same<X, d8_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x8_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x4_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x2_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
-                          is_same<X, d8_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x8_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x4_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x2_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 16, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t     = T;
-    using d1_nnv_t = non_native_vector_base<T, 1>;
-    using d2_t     = non_native_vector_base<T, 2>;
-    using d4_t     = non_native_vector_base<T, 4>;
-    using d8_t     = non_native_vector_base<T, 8>;
-    using d16_t    = non_native_vector_base<T, 16>;
-
-    using type = d16_t;
-
-    union alignas(next_pow2(16 * sizeof(T)))
-    {
-        d16_t d16_;
-        StaticallyIndexedArray<d1_t, 16> d1x16_;
-        StaticallyIndexedArray<d2_t, 8> d2x8_;
-        StaticallyIndexedArray<d4_t, 4> d4x4_;
-        StaticallyIndexedArray<d8_t, 2> d8x2_;
-        StaticallyIndexedArray<d16_t, 1> d16x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
-                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x16_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x8_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x4_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x2_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
-                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
-                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
-        {
-            return data_.d1x16_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x8_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x4_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x2_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 32, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t  = T;
-    using d2_t  = non_native_vector_base<T, 2>;
-    using d4_t  = non_native_vector_base<T, 4>;
-    using d8_t  = non_native_vector_base<T, 8>;
-    using d16_t = non_native_vector_base<T, 16>;
-    using d32_t = non_native_vector_base<T, 32>;
-
-    using type = d32_t;
-
-    union alignas(next_pow2(32 * sizeof(T)))
-    {
-        d32_t d32_;
-        StaticallyIndexedArray<d1_t, 32> d1x32_;
-        StaticallyIndexedArray<d2_t, 16> d2x16_;
-        StaticallyIndexedArray<d4_t, 8> d4x8_;
-        StaticallyIndexedArray<d8_t, 4> d8x4_;
-        StaticallyIndexedArray<d16_t, 2> d16x2_;
-        StaticallyIndexedArray<d32_t, 1> d32x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x32_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x16_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x8_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x4_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x2_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x32_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x16_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x8_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x4_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x2_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-template <typename T>
-struct vector_type<T, 64, typename ck::enable_if_t<!is_native_type<T>()>>
-{
-    using d1_t  = T;
-    using d2_t  = non_native_vector_base<T, 2>;
-    using d4_t  = non_native_vector_base<T, 4>;
-    using d8_t  = non_native_vector_base<T, 8>;
-    using d16_t = non_native_vector_base<T, 16>;
-    using d32_t = non_native_vector_base<T, 32>;
-    using d64_t = non_native_vector_base<T, 64>;
-
-    using type = d64_t;
-
-    union alignas(next_pow2(64 * sizeof(T)))
-    {
-        d64_t d64_;
-        StaticallyIndexedArray<d1_t, 64> d1x64_;
-        StaticallyIndexedArray<d2_t, 32> d2x32_;
-        StaticallyIndexedArray<d4_t, 16> d4x16_;
-        StaticallyIndexedArray<d8_t, 8> d8x8_;
-        StaticallyIndexedArray<d16_t, 4> d16x4_;
-        StaticallyIndexedArray<d32_t, 2> d32x2_;
-        StaticallyIndexedArray<d64_t, 1> d64x1_;
-    } data_;
-
-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
-
-    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
-
-    template <typename X>
-    __host__ __device__ constexpr const auto& AsType() const
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                          is_same<X, d64_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x64_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x32_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x16_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x8_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x4_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x2_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-
-    template <typename X>
-    __host__ __device__ constexpr auto& AsType()
-    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
-                          is_same<X, d64_t>::value,
-                      "Something went wrong, please check src and dst types.");
-
-        if constexpr(is_same<X, d1_t>::value)
-        {
-            return data_.d1x64_;
-        }
-        else if constexpr(is_same<X, d2_t>::value)
-        {
-            return data_.d2x32_;
-        }
-        else if constexpr(is_same<X, d4_t>::value)
-        {
-            return data_.d4x16_;
-        }
-        else if constexpr(is_same<X, d8_t>::value)
-        {
-            return data_.d8x8_;
-        }
-        else if constexpr(is_same<X, d16_t>::value)
-        {
-            return data_.d16x4_;
-        }
-        else if constexpr(is_same<X, d32_t>::value)
-        {
-            return data_.d32x2_;
-        }
-        else if constexpr(is_same<X, d64_t>::value)
-        {
-            return data_.d64x1_;
-        }
-        else
-        {
-            return err;
-        }
-    }
-};
-
-using int64_t = long;
-
-// fp64
-using double2_t = typename vector_type<double, 2>::type;
-using double4_t = typename vector_type<double, 4>::type;
-
-// fp32
-using float2_t  = typename vector_type<float, 2>::type;
-using float4_t  = typename vector_type<float, 4>::type;
-using float8_t  = typename vector_type<float, 8>::type;
-using float16_t = typename vector_type<float, 16>::type;
-using float32_t = typename vector_type<float, 32>::type;
-using float64_t = typename vector_type<float, 64>::type;
-
-// fp16
-using half2_t  = typename vector_type<half_t, 2>::type;
-using half4_t  = typename vector_type<half_t, 4>::type;
-using half8_t  = typename vector_type<half_t, 8>::type;
-using half16_t = typename vector_type<half_t, 16>::type;
-using half32_t = typename vector_type<half_t, 32>::type;
-using half64_t = typename vector_type<half_t, 64>::type;
-
-// bfp16
-using bhalf2_t  = typename vector_type<bhalf_t, 2>::type;
-using bhalf4_t  = typename vector_type<bhalf_t, 4>::type;
-using bhalf8_t  = typename vector_type<bhalf_t, 8>::type;
-using bhalf16_t = typename vector_type<bhalf_t, 16>::type;
-using bhalf32_t = typename vector_type<bhalf_t, 32>::type;
-using bhalf64_t = typename vector_type<bhalf_t, 64>::type;
-
-// i32
-using int32x2_t  = typename vector_type<int32_t, 2>::type;
-using int32x4_t  = typename vector_type<int32_t, 4>::type;
-using int32x8_t  = typename vector_type<int32_t, 8>::type;
-using int32x16_t = typename vector_type<int32_t, 16>::type;
-using int32x32_t = typename vector_type<int32_t, 32>::type;
-using int32x64_t = typename vector_type<int32_t, 64>::type;
-
-// i8
-using int8x2_t  = typename vector_type<int8_t, 2>::type;
-using int8x4_t  = typename vector_type<int8_t, 4>::type;
-using int8x8_t  = typename vector_type<int8_t, 8>::type;
-using int8x16_t = typename vector_type<int8_t, 16>::type;
-using int8x32_t = typename vector_type<int8_t, 32>::type;
-using int8x64_t = typename vector_type<int8_t, 64>::type;
-
-// f8
-using f8x2_fnuz_t  = typename vector_type<f8_fnuz_t, 2>::type;
-using f8x4_fnuz_t  = typename vector_type<f8_fnuz_t, 4>::type;
-using f8x8_fnuz_t  = typename vector_type<f8_fnuz_t, 8>::type;
-using f8x16_fnuz_t = typename vector_type<f8_fnuz_t, 16>::type;
-using f8x32_fnuz_t = typename vector_type<f8_fnuz_t, 32>::type;
-using f8x64_fnuz_t = typename vector_type<f8_fnuz_t, 64>::type;
-
-// bf8
-using bf8x2_fnuz_t  = typename vector_type<bf8_fnuz_t, 2>::type;
-using bf8x4_fnuz_t  = typename vector_type<bf8_fnuz_t, 4>::type;
-using bf8x8_fnuz_t  = typename vector_type<bf8_fnuz_t, 8>::type;
-using bf8x16_fnuz_t = typename vector_type<bf8_fnuz_t, 16>::type;
-using bf8x32_fnuz_t = typename vector_type<bf8_fnuz_t, 32>::type;
-using bf8x64_fnuz_t = typename vector_type<bf8_fnuz_t, 64>::type;
-
-// f8
-using f8x2_ocp_t  = typename vector_type<f8_ocp_t, 2>::type;
-using f8x4_ocp_t  = typename vector_type<f8_ocp_t, 4>::type;
-using f8x8_ocp_t  = typename vector_type<f8_ocp_t, 8>::type;
-using f8x16_ocp_t = typename vector_type<f8_ocp_t, 16>::type;
-using f8x32_ocp_t = typename vector_type<f8_ocp_t, 32>::type;
-using f8x64_ocp_t = typename vector_type<f8_ocp_t, 64>::type;
-
-// bf8
-using bf8x2_ocp_t  = typename vector_type<bf8_ocp_t, 2>::type;
-using bf8x4_ocp_t  = typename vector_type<bf8_ocp_t, 4>::type;
-using bf8x8_ocp_t  = typename vector_type<bf8_ocp_t, 8>::type;
-using bf8x16_ocp_t = typename vector_type<bf8_ocp_t, 16>::type;
-using bf8x32_ocp_t = typename vector_type<bf8_ocp_t, 32>::type;
-using bf8x64_ocp_t = typename vector_type<bf8_ocp_t, 64>::type;
-
-#if CK_FP8_TYPE_OCP
-// f8
-using f8x2_t  = f8x2_ocp_t;
-using f8x4_t  = f8x4_ocp_t;
-using f8x8_t  = f8x8_ocp_t;
-using f8x16_t = f8x16_ocp_t;
-using f8x32_t = f8x32_ocp_t;
-using f8x64_t = f8x64_ocp_t;
-
-// bf8
-using bf8x2_t  = bf8x2_ocp_t;
-using bf8x4_t  = bf8x4_ocp_t;
-using bf8x8_t  = bf8x8_ocp_t;
-using bf8x16_t = bf8x16_ocp_t;
-using bf8x32_t = bf8x32_ocp_t;
-using bf8x64_t = bf8x64_ocp_t;
-#elif CK_FP8_TYPE_FNUZ
-// f8
-using f8x2_t  = f8x2_fnuz_t;
-using f8x4_t  = f8x4_fnuz_t;
-using f8x8_t  = f8x8_fnuz_t;
-using f8x16_t = f8x16_fnuz_t;
-using f8x32_t = f8x32_fnuz_t;
-using f8x64_t = f8x64_fnuz_t;
-
-// bf8
-using bf8x2_t  = bf8x2_fnuz_t;
-using bf8x4_t  = bf8x4_fnuz_t;
-using bf8x8_t  = bf8x8_fnuz_t;
-using bf8x16_t = bf8x16_fnuz_t;
-using bf8x32_t = bf8x32_fnuz_t;
-using bf8x64_t = bf8x64_fnuz_t;
-#endif
-
-// u8
-using uint8x2_t  = typename vector_type<uint8_t, 2>::type;
-using uint8x4_t  = typename vector_type<uint8_t, 4>::type;
-using uint8x8_t  = typename vector_type<uint8_t, 8>::type;
-using uint8x16_t = typename vector_type<uint8_t, 16>::type;
-using uint8x32_t = typename vector_type<uint8_t, 32>::type;
-using uint8x64_t = typename vector_type<uint8_t, 64>::type;
-
-// f4
-using f4x2_t  = typename vector_type<f4x2_pk_t, 1>::type;
-using f4x4_t  = typename vector_type<f4x2_pk_t, 2>::type;
-using f4x8_t  = typename vector_type<f4x2_pk_t, 4>::type;
-using f4x16_t = typename vector_type<f4x2_pk_t, 8>::type;
-using f4x32_t = typename vector_type<f4x2_pk_t, 16>::type;
-using f4x64_t = typename vector_type<f4x2_pk_t, 32>::type;
-
-// f6
-using f6x16_t = typename vector_type<f6x16_pk_t, 1>::type;
-using f6x32_t = typename vector_type<f6x32_pk_t, 1>::type;
-
-// bf6
-using bf6x16_t = typename vector_type<bf6x16_pk_t, 1>::type;
-using bf6x32_t = typename vector_type<bf6x32_pk_t, 1>::type;
-
-// pack int4
-using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
-using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type;
-using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type;
-
-#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
-template <typename T>
-struct NumericLimits;
-
-template <>
-struct NumericLimits<int32_t>
-{
-    __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
-
-    __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
-
-    __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
-
-    __host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; }
-
-    __host__ __device__ static constexpr int32_t QuietNaN() { return 0; }
-};
-template <>
-struct NumericLimits<int16_t>
-{
-    __host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; }
-
-    __host__ __device__ static constexpr int16_t Min() noexcept { return -32768; }
-
-    __host__ __device__ static constexpr int16_t Max() noexcept { return 32767; }
-
-    __host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; }
-
-    __host__ __device__ static constexpr int16_t QuietNaN() { return 0; }
-};
-
-template <>
-struct NumericLimits<int8_t>
-{
-    __host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; }
-
-    __host__ __device__ static constexpr int8_t Min() noexcept { return -128; }
-
-    __host__ __device__ static constexpr int8_t Max() noexcept { return 127; }
-
-    __host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; }
-
-    __host__ __device__ static constexpr int8_t QuietNaN() { return 0; }
-};
-
-template <>
-struct NumericLimits<uint32_t>
-{
-    __host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; }
-
-    __host__ __device__ static constexpr uint32_t Min() noexcept { return 0; }
-
-    __host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; }
-
-    __host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; }
-
-    __host__ __device__ static constexpr uint32_t QuietNaN() { return 0; }
-};
-
-template <>
-struct NumericLimits<uint16_t>
-{
-    __host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; }
-
-    __host__ __device__ static constexpr uint16_t Min() noexcept { return 0; }
-
-    __host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; }
-
-    __host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; }
-
-    __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
-};
-
-template <>
-struct NumericLimits<float>
-{
-    static constexpr unsigned int binary_min    = 0x00800000;
-    static constexpr unsigned int binary_max    = 0x7F7FFFFF;
-    static constexpr unsigned int binary_lowest = 0xFF7FFFFF;
-    static constexpr unsigned int binary_qnan   = 0xFFC00001;
-    static constexpr unsigned int binary_inf    = 0x7F8000000;
-
-    __host__ __device__ static constexpr float Min() { return bit_cast<float>(binary_min); }
-
-    __host__ __device__ static constexpr float Max() { return bit_cast<float>(binary_max); }
-
-    __host__ __device__ static constexpr float Lowest() { return bit_cast<float>(binary_lowest); }
-
-    __host__ __device__ static constexpr float QuietNaN() { return bit_cast<float>(binary_qnan); }
-
-    __host__ __device__ static constexpr float Infinity() { return bit_cast<float>(binary_inf); }
-};
-
-template <>
-struct NumericLimits<half_t>
-{
-    static constexpr unsigned short binary_min    = 0x0400;
-    static constexpr unsigned short binary_max    = 0x7BFF;
-    static constexpr unsigned short binary_lowest = 0xFBFF;
-    static constexpr unsigned short binary_qnan   = 0x7FFF;
-
-    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
-
-    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
-
-    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
-
-    __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
-};
-
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-template <>
-struct NumericLimits<int4_t>
-{
-    __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
-
-    __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
-
-    __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
-};
-#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-
-template <>
-struct NumericLimits<f8_fnuz_t>
-{
-    // negative zero nan mode with exp bias = 8
-    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
-    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
-    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
-    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
-    // ieee mode with exp bias = 7
-    // static constexpr uint8_t binary_min    = 0x08; // 0b00001000
-    // static constexpr uint8_t binary_max    = 0x77; // 0b01110111
-    // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
-    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=0
-
-    __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
-
-    __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
-
-    __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
-
-    __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
-};
-
-template <>
-struct NumericLimits<bf8_fnuz_t>
-{
-    // negative zero nan mode with exp bias = 16
-    static constexpr uint8_t binary_min    = 0x04; // 0b00000100
-    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
-    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
-    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
-    // ieee mode with exp bias = 15
-    // static constexpr uint8_t binary_min    = 0x04; // 0b00000100
-    // static constexpr uint8_t binary_max    = 0x7B; // 0b01111011
-    // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
-    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=
-
-    __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
-
-    __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
-
-    __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
-
-    __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
-};
-
-template <>
-struct NumericLimits<f8_ocp_t>
-{
-    static constexpr uint8_t binary_min    = 0x08; // 0b00001000 = 2^-6
-    static constexpr uint8_t binary_max    = 0x7E; // 0b01111110 = 448
-    static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
-    static constexpr uint8_t binary_qnan   = 0x7F; // 0b01111111
-
-    __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
-
-    __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
-
-    __host__ __device__ static constexpr f8_ocp_t Lowest()
-    {
-        return bit_cast<f8_ocp_t>(binary_lowest);
-    }
-
-    __host__ __device__ static constexpr f8_ocp_t QuietNaN()
-    {
-        return bit_cast<f8_ocp_t>(binary_qnan);
-    }
-};
-
-template <>
-struct NumericLimits<bf8_ocp_t>
-{
-    static constexpr uint8_t binary_min    = 0x04; // 0b00000100 = 2^-14
-    static constexpr uint8_t binary_max    = 0x7B; // 0b01111011 = 57344
-    static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
-    static constexpr uint8_t binary_qnan   = 0x7D; // 0b01111101
-
-    __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
-
-    __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
-
-    __host__ __device__ static constexpr bf8_ocp_t Lowest()
-    {
-        return bit_cast<bf8_ocp_t>(binary_lowest);
-    }
-
-    __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
-    {
-        return bit_cast<bf8_ocp_t>(binary_qnan);
-    }
-};
-
-template <>
-struct NumericLimits<f4_t>
-{
-    static constexpr uint8_t binary_min_normal    = 0x2; // 0b0010
-    static constexpr uint8_t binary_max_normal    = 0x7; // 0b0111
-    static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
-    static constexpr uint8_t binary_min_subnorm   = 0x1; // 0b0001
-    static constexpr uint8_t binary_max_subnorm   = 0x1; // 0b0001
-
-    static constexpr float data_max_normal_number    = 6;
-    static constexpr float data_min_subnormal_number = 0.5;
-
-    __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
-    __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
-    __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
-    __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
-    __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
-
-    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
-    __host__ __device__ static constexpr float DataMinSubnorm()
-    {
-        return data_min_subnormal_number;
-    }
-};
-
-template <>
-struct NumericLimits<f6_t>
-{
-    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
-    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
-    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
-    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
-    static constexpr uint8_t binary_max_subnorm   = 0x07; // 0b000111
-
-    static constexpr float data_max_normal_number    = 7.5;
-    static constexpr float data_min_subnormal_number = 0.125;
-
-    __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
-    __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
-    __host__ __device__ static constexpr f6_t Lowest()
-    {
-        return f6_t(binary_lowest_normal & 0b111111);
-    }
-    __host__ __device__ static constexpr f6_t MinSubnorm()
-    {
-        return f6_t(binary_min_subnorm & 0b111111);
-    }
-    __host__ __device__ static constexpr f6_t MaxSubnorm()
-    {
-        return f6_t(binary_max_subnorm & 0b111111);
-    }
-
-    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
-    __host__ __device__ static constexpr float DataMinSubnorm()
-    {
-        return data_min_subnormal_number;
-    }
-};
-
-template <>
-struct NumericLimits<bf6_t>
-{
-    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
-    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
-    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
-    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
-    static constexpr uint8_t binary_max_subnorm   = 0x03; // 0b000011
-
-    static constexpr float data_max_normal_number    = 28;
-    static constexpr float data_min_subnormal_number = 0.0625;
-
-    __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
-    __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
-    __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
-    __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
-    __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
-
-    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
-    __host__ __device__ static constexpr float DataMinSubnorm()
-    {
-        return data_min_subnormal_number;
-    }
-};
-
-template <>
-struct NumericLimits<e8m0_bexp_t>
-{
-    static constexpr e8m0_bexp_t binary_min  = 0x00; // 0b00000000
-    static constexpr e8m0_bexp_t binary_max  = 0xFE; // 0b11111110
-    static constexpr e8m0_bexp_t binary_qnan = 0xFF; // 0b11111111
-    static constexpr e8m0_bexp_t binary_1    = 0x7F; // 0b01111111
-    static constexpr e8m0_bexp_t binary_2    = 0x80; // 0b10000000
-    static constexpr e8m0_bexp_t binary_3    = 0x82; // 0b10000010
-    static constexpr e8m0_bexp_t binary_135  = 0x87; // 0b10000111
-    static constexpr e8m0_bexp_t binary_142  = 0x8E; // 0b10001110
-
-    __host__ __device__ static constexpr e8m0_bexp_t Min() { return e8m0_bexp_t(binary_min); }
-    __host__ __device__ static constexpr e8m0_bexp_t Max() { return e8m0_bexp_t(binary_max); }
-    __host__ __device__ static constexpr e8m0_bexp_t QuietNaN() { return e8m0_bexp_t(binary_qnan); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_1() { return e8m0_bexp_t(binary_1); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_2() { return e8m0_bexp_t(binary_2); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_3() { return e8m0_bexp_t(binary_3); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_135()
-    {
-        return e8m0_bexp_t(binary_135);
-    }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_142()
-    {
-        return e8m0_bexp_t(binary_142);
-    }
-};
+#if defined(_WIN32)
+using int64_t = long long;
 #else
-template <typename T>
-struct NumericLimits
-{
-    __host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); }
-    __host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); }
-    __host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); }
-    __host__ __device__ static constexpr T QuietNaN()
-    {
-        return std::numeric_limits<T>::quiet_NaN();
-    }
-    __host__ __device__ static constexpr T Infinity() { return std::numeric_limits<T>::infinity(); }
-};
-
-template <>
-struct NumericLimits<half_t>
-{
-    static constexpr unsigned short binary_min    = 0x0400;
-    static constexpr unsigned short binary_max    = 0x7BFF;
-    static constexpr unsigned short binary_lowest = 0xFBFF;
-    static constexpr unsigned short binary_qnan   = 0x7FFF;
-
-    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
-
-    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
-
-    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
-
-    __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
-};
-
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-template <>
-struct NumericLimits<int4_t>
-{
-    __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
-
-    __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
-
-    __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
-};
-#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-
-template <>
-struct NumericLimits<f8_fnuz_t>
-{
-    // negative zero nan mode with exp bias = 8
-    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
-    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
-    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
-    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
-    // ieee mode with exp bias = 7
-    // static constexpr uint8_t binary_min    = 0x08; // 0b00001000
-    // static constexpr uint8_t binary_max    = 0x77; // 0b01110111
-    // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
-    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=0
-
-    __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
-
-    __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
-
-    __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
-
-    __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
-};
-
-template <>
-struct NumericLimits<bf8_fnuz_t>
-{
-    // negative zero nan mode with exp bias = 16
-    static constexpr uint8_t binary_min    = 0x04; // 0b00000100
-    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
-    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
-    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
-    // ieee mode with exp bias = 15
-    // static constexpr uint8_t binary_min    = 0x04; // 0b00000100
-    // static constexpr uint8_t binary_max    = 0x7B; // 0b01111011
-    // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
-    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=
-
-    __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
-
-    __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
-
-    __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
-
-    __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
-};
-
-template <>
-struct NumericLimits<f8_ocp_t>
-{
-    static constexpr uint8_t binary_min    = 0x08; // 0b00001000 = 2^-6
-    static constexpr uint8_t binary_max    = 0x7E; // 0b01111110 = 448
-    static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
-    static constexpr uint8_t binary_qnan   = 0x7F; // 0b01111111
-
-    __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
-
-    __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
-
-    __host__ __device__ static constexpr f8_ocp_t Lowest()
-    {
-        return bit_cast<f8_ocp_t>(binary_lowest);
-    }
-
-    __host__ __device__ static constexpr f8_ocp_t QuietNaN()
-    {
-        return bit_cast<f8_ocp_t>(binary_qnan);
-    }
-};
-
-template <>
-struct NumericLimits<bf8_ocp_t>
-{
-    static constexpr uint8_t binary_min    = 0x04; // 0b00000100 = 2^-14
-    static constexpr uint8_t binary_max    = 0x7B; // 0b01111011 = 57344
-    static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
-    static constexpr uint8_t binary_qnan   = 0x7D; // 0b01111101
-
-    __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
-
-    __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
-
-    __host__ __device__ static constexpr bf8_ocp_t Lowest()
-    {
-        return bit_cast<bf8_ocp_t>(binary_lowest);
-    }
-
-    __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
-    {
-        return bit_cast<bf8_ocp_t>(binary_qnan);
-    }
-};
-
-template <>
-struct NumericLimits<f4_t>
-{
-    static constexpr uint8_t binary_min_normal    = 0x2; // 0b0010
-    static constexpr uint8_t binary_max_normal    = 0x7; // 0b0111
-    static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
-    static constexpr uint8_t binary_min_subnorm   = 0x1; // 0b0001
-    static constexpr uint8_t binary_max_subnorm   = 0x1; // 0b0001
-
-    static constexpr float data_max_normal_number    = 6;
-    static constexpr float data_min_subnormal_number = 0.5;
-
-    __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
-    __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
-    __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
-    __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
-    __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
-
-    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
-    __host__ __device__ static constexpr float DataMinSubnorm()
-    {
-        return data_min_subnormal_number;
-    }
-};
-
-template <>
-struct NumericLimits<f6_t>
-{
-    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
-    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
-    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
-    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
-    static constexpr uint8_t binary_max_subnorm   = 0x07; // 0b000111
-
-    static constexpr float data_max_normal_number    = 7.5;
-    static constexpr float data_min_subnormal_number = 0.125;
-
-    __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
-    __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
-    __host__ __device__ static constexpr f6_t Lowest()
-    {
-        return f6_t(binary_lowest_normal & 0b111111);
-    }
-    __host__ __device__ static constexpr f6_t MinSubnorm()
-    {
-        return f6_t(binary_min_subnorm & 0b111111);
-    }
-    __host__ __device__ static constexpr f6_t MaxSubnorm()
-    {
-        return f6_t(binary_max_subnorm & 0b111111);
-    }
-
-    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
-    __host__ __device__ static constexpr float DataMinSubnorm()
-    {
-        return data_min_subnormal_number;
-    }
-};
-
-template <>
-struct NumericLimits<bf6_t>
-{
-    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
-    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
-    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
-    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
-    static constexpr uint8_t binary_max_subnorm   = 0x03; // 0b000011
-
-    static constexpr float data_max_normal_number    = 28;
-    static constexpr float data_min_subnormal_number = 0.0625;
-
-    __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
-    __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
-    __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
-    __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
-    __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
-
-    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
-    __host__ __device__ static constexpr float DataMinSubnorm()
-    {
-        return data_min_subnormal_number;
-    }
-};
-
-template <>
-struct NumericLimits<e8m0_bexp_t>
-{
-    static constexpr e8m0_bexp_t binary_min  = 0x00; // 0b00000000
-    static constexpr e8m0_bexp_t binary_max  = 0xFE; // 0b11111110
-    static constexpr e8m0_bexp_t binary_qnan = 0xFF; // 0b11111111
-    static constexpr e8m0_bexp_t binary_1    = 0x7F; // 0b01111111
-    static constexpr e8m0_bexp_t binary_2    = 0x80; // 0b10000000
-    static constexpr e8m0_bexp_t binary_3    = 0x82; // 0b10000010
-    static constexpr e8m0_bexp_t binary_135  = 0x87; // 0b10000111
-    static constexpr e8m0_bexp_t binary_142  = 0x8E; // 0b10001110
-
-    __host__ __device__ static constexpr e8m0_bexp_t Min() { return e8m0_bexp_t(binary_min); }
-    __host__ __device__ static constexpr e8m0_bexp_t Max() { return e8m0_bexp_t(binary_max); }
-    __host__ __device__ static constexpr e8m0_bexp_t QuietNaN() { return e8m0_bexp_t(binary_qnan); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_1() { return e8m0_bexp_t(binary_1); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_2() { return e8m0_bexp_t(binary_2); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_3() { return e8m0_bexp_t(binary_3); }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_135()
-    {
-        return e8m0_bexp_t(binary_135);
-    }
-    __host__ __device__ static constexpr e8m0_bexp_t Binary_142()
-    {
-        return e8m0_bexp_t(binary_142);
-    }
-};
+using int64_t = long;
 #endif
 
-template <typename T>
-struct NumericUtils
-{
-};
-
-template <>
-struct NumericUtils<float>
-{
-    static constexpr int exp            = 8;
-    static constexpr int mant           = 23;
-    static constexpr int bias           = 127;
-    static constexpr uint32_t nan_mask  = 0x7F800000;
-    static constexpr uint32_t head_mask = 0xFF800000;
-    static constexpr uint32_t mant_mask = 0x7FFFFF;
-    static constexpr uint32_t exp_mask  = 0xFF;
-    static constexpr uint32_t Inf       = 0x7F800000;
-    static constexpr uint32_t NegInf    = 0xFF800000;
-    static constexpr uint32_t NaN       = 0x7F800001;
-    static constexpr uint32_t Neg0      = 0x80000000;
-    static constexpr bool has_inf       = true;
-    using bitwise_type                  = uint32_t;
-};
-
-template <>
-struct NumericUtils<half_t>
-{
-    static constexpr int exp            = 5;
-    static constexpr int mant           = 10;
-    static constexpr int bias           = 15;
-    static constexpr uint16_t nan_mask  = 0x7C00;
-    static constexpr uint16_t head_mask = 0xFC00;
-    static constexpr uint16_t mant_mask = 0x3FF;
-    static constexpr uint16_t exp_mask  = 0x1F;
-    static constexpr uint32_t Inf       = 0x7C00;
-    static constexpr uint32_t NegInf    = 0xFC00;
-    static constexpr uint32_t NaN       = 0x7C01;
-    static constexpr uint32_t Neg0      = 0x8000;
-    static constexpr bool has_inf       = true;
-    using bitwise_type                  = uint16_t;
-};
-
-template <>
-struct NumericUtils<bhalf_t>
-{
-    static constexpr int exp  = 8;
-    static constexpr int mant = 7;
-    static constexpr int bias = 128; // negative zero nan mode
-    // static constexpr int bias = 127; // ieee mode
-};
-
-template <>
-struct NumericUtils<f8_fnuz_t>
-{
-    static constexpr int exp  = 4;
-    static constexpr int mant = 3;
-    static constexpr int bias = 8; // negative zero nan mode
-    // static constexpr int bias = 7; // ieee mode
-    static constexpr bool has_inf = false;
-};
-
-template <>
-struct NumericUtils<bf8_fnuz_t>
-{
-    static constexpr int exp  = 5;
-    static constexpr int mant = 2;
-    static constexpr int bias = 16; // negative zero nan mode
-    // static constexpr int bias = 15; // ieee mode
-    static constexpr bool has_inf = false;
-};
-template <>
-struct NumericUtils<f8_ocp_t>
-{
-    static constexpr int exp  = 4;
-    static constexpr int mant = 3;
-    static constexpr int bias = 7;
-};
-
-template <>
-struct NumericUtils<bf8_ocp_t>
-{
-    static constexpr int exp  = 5;
-    static constexpr int mant = 2;
-    static constexpr int bias = 15;
-};
-
-template <>
-struct NumericUtils<f4_t>
-{
-    static constexpr int exp           = 2;
-    static constexpr int mant          = 1;
-    static constexpr int bias          = 1;
-    static constexpr uint32_t sr_shift = 10;
-
-    static constexpr int unbiased_exp_min = 0;
-    static constexpr int unbiased_exp_max = 2;
-    static constexpr int biased_exp_min   = 1;
-    static constexpr int biased_exp_max   = 3;
-
-    static constexpr uint8_t positive_zero_mask = 0b0000;
-    static constexpr uint8_t negative_zero_mask = 0b1000;
-
-    static constexpr uint8_t one_mask      = 0b0010;
-    static constexpr uint8_t set_sign_mask = 0b0111;
-
-    static constexpr uint8_t data_max_positive_normal_mask = 0b0111;
-    static constexpr uint8_t data_max_negative_normal_mask = 0b1111;
-
-    static constexpr uint8_t data_max_positive_subnormal_mask = 0b0001;
-    static constexpr uint8_t data_max_negative_subnormal_mask = 0b1001;
-
-    static constexpr bool has_inf = false;
-
-    using bitwise_type = uint8_t;
-};
-
-template <>
-struct NumericUtils<f6_t>
-{
-    static constexpr int exp           = 2;
-    static constexpr int mant          = 3;
-    static constexpr int bias          = 1;
-    static constexpr uint32_t sr_shift = 12;
-
-    static constexpr int unbiased_exp_min = 0;
-    static constexpr int unbiased_exp_max = 2;
-    static constexpr int biased_exp_min   = 1;
-    static constexpr int biased_exp_max   = 3;
-
-    static constexpr uint8_t positive_zero_mask = 0b000000;
-    static constexpr uint8_t negative_zero_mask = 0b100000;
-
-    static constexpr uint8_t set_sign_mask = 0b011111;
-
-    static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
-    static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
-
-    static constexpr uint8_t data_max_positive_subnormal_mask = 0b000111;
-    static constexpr uint8_t data_max_negative_subnormal_mask = 0b100111;
-
-    static constexpr bool has_inf  = false;
-    static constexpr bool has_nan  = false;
-    static constexpr bool has_zero = true;
-
-    using bitwise_type = uint8_t;
-};
-
-template <>
-struct NumericUtils<bf6_t>
-{
-    static constexpr int exp           = 3;
-    static constexpr int mant          = 2;
-    static constexpr int bias          = 3;
-    static constexpr uint32_t sr_shift = 11;
-
-    static constexpr int unbiased_exp_min = -2;
-    static constexpr int unbiased_exp_max = 4;
-    static constexpr int biased_exp_min   = 1;
-    static constexpr int biased_exp_max   = 7;
-
-    static constexpr uint8_t positive_zero_mask = 0b000000;
-    static constexpr uint8_t negative_zero_mask = 0b100000;
-
-    static constexpr uint8_t set_sign_mask = 0b011111;
-
-    static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
-    static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
-
-    static constexpr uint8_t data_max_positive_subnormal_mask = 0b000011;
-    static constexpr uint8_t data_max_negative_subnormal_mask = 0b100011;
-
-    static constexpr bool has_inf  = false;
-    static constexpr bool has_nan  = false;
-    static constexpr bool has_zero = true;
-
-    using bitwise_type = uint8_t;
-};
-
-template <>
-struct NumericUtils<e8m0_bexp_t>
-{
-    static constexpr int exp  = 8;
-    static constexpr int mant = 0;
-    static constexpr int bias = 127;
-
-    static constexpr int unbiased_exp_min = -127;
-    static constexpr int unbiased_exp_max = 127;
-    static constexpr int biased_exp_min   = 0;
-    static constexpr int biased_exp_max   = 254;
-
-    using bitwise_type = uint8_t;
-};
 } // namespace ck
diff --git a/include/ck/utility/dtype_fp64.hpp b/include/ck/utility/dtype_fp64.hpp
new file mode 100644
index 0000000000..3c63d083ad
--- /dev/null
+++ b/include/ck/utility/dtype_fp64.hpp
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+namespace ck {
+// fp64
+using double2_t = typename vector_type<double, 2>::type;
+using double4_t = typename vector_type<double, 4>::type;
+} // namespace ck
diff --git a/include/ck/utility/dtype_vector.hpp b/include/ck/utility/dtype_vector.hpp
new file mode 100644
index 0000000000..9c40d923d3
--- /dev/null
+++ b/include/ck/utility/dtype_vector.hpp
@@ -0,0 +1,2136 @@
+// SPDX-License-Identifier: MIT
+// // // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+
+// vector_type
+template <typename T, index_t N, typename Enable = void>
+struct vector_type;
+
+// Caution: DO NOT REMOVE
+// intentionally have only declaration but no definition to cause compilation failure when trying to
+// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
+// vectors"
+template <typename T, index_t V, index_t N>
+struct vector_type<T __attribute__((ext_vector_type(V))), N>;
+
+// Caution: DO NOT REMOVE
+// intentionally have only declaration but no definition to cause compilation failure when trying to
+// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
+// vectors"
+template <typename T, index_t V, index_t N>
+struct vector_type<vector_type<T, V>, N>;
+
+// vector_type_maker
+// This is the right way to handle "vector of vectors": making a bigger vector instead
+template <typename T, index_t N>
+struct vector_type_maker
+{
+    using type = vector_type<T, N>;
+};
+
+template <typename T, index_t N>
+struct scalar_type<vector_type<T, N>>
+{
+    using type                           = T;
+    static constexpr index_t vector_size = N;
+};
+
+template <typename T, index_t N0, index_t N1>
+struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
+{
+    using type = vector_type<T, N0 * N1>;
+};
+
+template <typename T, index_t N0, index_t N1>
+struct vector_type_maker<vector_type<T, N1>, N0>
+{
+    using type = vector_type<T, N0 * N1>;
+};
+
+template <typename T, index_t N>
+using vector_type_maker_t = typename vector_type_maker<T, N>::type;
+
+template <typename T, index_t N>
+__host__ __device__ constexpr auto make_vector_type(Number<N>)
+{
+    return typename vector_type_maker<T, N>::type{};
+}
+
+template <typename T>
+struct vector_type<T, 1, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    using type = d1_t;
+
+    union
+    {
+        T d1_;
+        StaticallyIndexedArray<T, 1> d1x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        return data_.d1x1_;
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        return data_.d1x1_;
+    }
+};
+
+__device__ int static err = 0;
+template <typename T>
+struct vector_type<T, 2, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+
+    using type = d2_t;
+
+    union
+    {
+        d2_t d2_;
+        StaticallyIndexedArray<d1_t, 2> d1x2_;
+        StaticallyIndexedArray<d2_t, 1> d2x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x2_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x2_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 3, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d3_t __attribute__((ext_vector_type(3)));
+
+    using type = d3_t;
+
+    union
+    {
+        d3_t d3_;
+        StaticallyIndexedArray<d1_t, 3> d1x3_;
+        StaticallyIndexedArray<d2_t, 1> d2x1_;
+        StaticallyIndexedArray<d3_t, 1> d3x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d3_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x3_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x1_;
+        }
+        else if constexpr(is_same<X, d3_t>::value)
+        {
+            return data_.d3x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d3_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x3_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x1_;
+        }
+        else if constexpr(is_same<X, d3_t>::value)
+        {
+            return data_.d3x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 4, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+
+    using type = d4_t;
+
+    union
+    {
+        d4_t d4_;
+        StaticallyIndexedArray<d1_t, 4> d1x4_;
+        StaticallyIndexedArray<d2_t, 2> d2x2_;
+        StaticallyIndexedArray<d4_t, 1> d4x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x4_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x2_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x4_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x2_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 5, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d5_t __attribute__((ext_vector_type(5)));
+
+    using type = d5_t;
+
+    union
+    {
+        d5_t d5_;
+        StaticallyIndexedArray<d1_t, 5> d1x5_;
+        StaticallyIndexedArray<d4_t, 1> d4x1_;
+        StaticallyIndexedArray<d5_t, 1> d5x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value || is_same<X, d5_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x5_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else if constexpr(is_same<X, d5_t>::value)
+        {
+            return data_.d5x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value || is_same<X, d5_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x5_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else if constexpr(is_same<X, d5_t>::value)
+        {
+            return data_.d5x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 7, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d7_t __attribute__((ext_vector_type(7)));
+
+    using type = d7_t;
+
+    union
+    {
+        d7_t d7_;
+        StaticallyIndexedArray<d1_t, 7> d1x7_;
+        StaticallyIndexedArray<d2_t, 3> d2x3_;
+        StaticallyIndexedArray<d4_t, 1> d4x1_;
+        StaticallyIndexedArray<d7_t, 1> d7x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d7_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x7_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x3_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else if constexpr(is_same<X, d7_t>::value)
+        {
+            return data_.d7x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d7_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x7_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x3_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else if constexpr(is_same<X, d7_t>::value)
+        {
+            return data_.d7x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 8, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+
+    using type = d8_t;
+
+    union
+    {
+        d8_t d8_;
+        StaticallyIndexedArray<d1_t, 8> d1x8_;
+        StaticallyIndexedArray<d2_t, 4> d2x4_;
+        StaticallyIndexedArray<d4_t, 2> d4x2_;
+        StaticallyIndexedArray<d8_t, 1> d8x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x8_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x4_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x2_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x8_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x4_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x2_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 13, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d13_t __attribute__((ext_vector_type(13)));
+
+    using type = d13_t;
+
+    union
+    {
+        d13_t d13_;
+        StaticallyIndexedArray<d1_t, 13> d1x13_;
+        StaticallyIndexedArray<d4_t, 3> d4x3_;
+        StaticallyIndexedArray<d8_t, 1> d8x1_;
+        StaticallyIndexedArray<d13_t, 1> d13x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d13_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x13_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x3_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x1_;
+        }
+        else if constexpr(is_same<X, d13_t>::value)
+        {
+            return data_.d13x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d13_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x13_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x3_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x1_;
+        }
+        else if constexpr(is_same<X, d13_t>::value)
+        {
+            return data_.d13x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 16, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+
+    using type = d16_t;
+
+    union
+    {
+        d16_t d16_;
+        StaticallyIndexedArray<d1_t, 16> d1x16_;
+        StaticallyIndexedArray<d2_t, 8> d2x8_;
+        StaticallyIndexedArray<d4_t, 4> d4x4_;
+        StaticallyIndexedArray<d8_t, 2> d8x2_;
+        StaticallyIndexedArray<d16_t, 1> d16x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x16_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x8_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x4_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x2_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x16_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x8_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x4_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x2_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 32, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+    typedef T d32_t __attribute__((ext_vector_type(32)));
+
+    using type = d32_t;
+
+    union
+    {
+        d32_t d32_;
+        StaticallyIndexedArray<d1_t, 32> d1x32_;
+        StaticallyIndexedArray<d2_t, 16> d2x16_;
+        StaticallyIndexedArray<d4_t, 8> d4x8_;
+        StaticallyIndexedArray<d8_t, 4> d8x4_;
+        StaticallyIndexedArray<d16_t, 2> d16x2_;
+        StaticallyIndexedArray<d32_t, 1> d32x1_;
+    } data_ = {d32_t{0}};
+
+    __attribute__((host)) __attribute__((device)) constexpr vector_type() {}
+
+    __attribute__((host)) __attribute__((device)) constexpr vector_type(type v) { (void)v; }
+
+    // __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    // __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x32_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x16_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x8_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x4_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x2_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x32_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x16_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x8_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x4_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x2_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 64, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+    typedef T d32_t __attribute__((ext_vector_type(32)));
+    typedef T d64_t __attribute__((ext_vector_type(64)));
+
+    using type = d64_t;
+
+    union
+    {
+        d64_t d64_;
+        StaticallyIndexedArray<d1_t, 64> d1x64_;
+        StaticallyIndexedArray<d2_t, 32> d2x32_;
+        StaticallyIndexedArray<d4_t, 16> d4x16_;
+        StaticallyIndexedArray<d8_t, 8> d8x8_;
+        StaticallyIndexedArray<d16_t, 4> d16x4_;
+        StaticallyIndexedArray<d32_t, 2> d32x2_;
+        StaticallyIndexedArray<d64_t, 1> d64x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x64_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x32_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x16_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x8_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x4_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x2_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x64_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x32_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x16_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x8_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x4_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x2_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 128, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+    typedef T d32_t __attribute__((ext_vector_type(32)));
+    typedef T d64_t __attribute__((ext_vector_type(64)));
+    typedef T d128_t __attribute__((ext_vector_type(128)));
+
+    using type = d128_t;
+
+    union
+    {
+        d128_t d128_;
+        StaticallyIndexedArray<d1_t, 128> d1x128_;
+        StaticallyIndexedArray<d2_t, 64> d2x64_;
+        StaticallyIndexedArray<d4_t, 32> d4x32_;
+        StaticallyIndexedArray<d8_t, 16> d8x16_;
+        StaticallyIndexedArray<d16_t, 8> d16x8_;
+        StaticallyIndexedArray<d32_t, 4> d32x4_;
+        StaticallyIndexedArray<d64_t, 2> d64x2_;
+        StaticallyIndexedArray<d128_t, 1> d128x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value || is_same<X, d128_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x128_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x64_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x32_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x16_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x8_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x4_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x2_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value || is_same<X, d128_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x128_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x64_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x32_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x16_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x8_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x4_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x2_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 256, typename ck::enable_if_t<is_native_type<T>()>>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+    typedef T d32_t __attribute__((ext_vector_type(32)));
+    typedef T d64_t __attribute__((ext_vector_type(64)));
+    typedef T d128_t __attribute__((ext_vector_type(128)));
+    typedef T d256_t __attribute__((ext_vector_type(256)));
+
+    using type = d256_t;
+
+    union
+    {
+        d256_t d256_;
+        StaticallyIndexedArray<d1_t, 256> d1x256_;
+        StaticallyIndexedArray<d2_t, 128> d2x128_;
+        StaticallyIndexedArray<d4_t, 64> d4x64_;
+        StaticallyIndexedArray<d8_t, 32> d8x32_;
+        StaticallyIndexedArray<d16_t, 16> d16x16_;
+        StaticallyIndexedArray<d32_t, 8> d32x8_;
+        StaticallyIndexedArray<d64_t, 4> d64x4_;
+        StaticallyIndexedArray<d128_t, 2> d128x2_;
+        StaticallyIndexedArray<d256_t, 1> d256x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(
+            is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                is_same<X, d8_t>::value || is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                is_same<X, d64_t>::value || is_same<X, d128_t>::value || is_same<X, d256_t>::value,
+            "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x256_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x128_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x64_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x32_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x16_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x8_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x4_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x2_;
+        }
+        else if constexpr(is_same<X, d256_t>::value)
+        {
+            return data_.d256x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(
+            is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                is_same<X, d8_t>::value || is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                is_same<X, d64_t>::value || is_same<X, d128_t>::value || is_same<X, d256_t>::value,
+            "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x256_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x128_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x64_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x32_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x16_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x8_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x4_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x2_;
+        }
+        else if constexpr(is_same<X, d256_t>::value)
+        {
+            return data_.d256x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T, index_t N, typename Enable = void>
+struct non_native_vector_base;
+
+template <typename T>
+struct nnvb_data_t_selector
+{
+    using type = unsigned _BitInt(8 * sizeof(T));
+};
+
+template <>
+struct nnvb_data_t_selector<f8_ocp_t>
+{
+    using type = f8_ocp_t::data_type;
+};
+
+template <>
+struct nnvb_data_t_selector<bf8_ocp_t>
+{
+    using type = bf8_ocp_t::data_type;
+};
+
+template <>
+struct nnvb_data_t_selector<e8m0_bexp_t>
+{
+    using type = e8m0_bexp_t::type;
+};
+
+template <>
+struct nnvb_data_t_selector<f6x16_pk_t>
+{
+    using type = f6x16_pk_t::type;
+};
+
+template <>
+struct nnvb_data_t_selector<f6x32_pk_t>
+{
+    using type = f6x32_pk_t::type;
+};
+
+template <>
+struct nnvb_data_t_selector<bf6x16_pk_t>
+{
+    using type = bf6x16_pk_t::type;
+};
+
+template <>
+struct nnvb_data_t_selector<bf6x32_pk_t>
+{
+    using type = bf6x32_pk_t::type;
+};
+
+template <>
+struct nnvb_data_t_selector<pk_i4_t>
+{
+    using type = pk_i4_t::type;
+};
+
+template <typename T, index_t N>
+struct non_native_vector_base<
+    T,
+    N,
+    ck::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>>
+{
+    using data_t = typename nnvb_data_t_selector<T>::type; // select data_t based on the size of T
+    static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
+    using data_v = data_t __attribute__((ext_vector_type(N)));
+    using type   = non_native_vector_base<T, N>;
+
+    union alignas(next_pow2(N * sizeof(T)))
+    {
+        data_v dN; // storage vector;
+        StaticallyIndexedArray<data_t, N> dxN;
+        StaticallyIndexedArray<T, N> dTxN;
+        StaticallyIndexedArray<data_v, 1> dNx1;
+    } data_;
+
+    __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {}
+    __host__ __device__ constexpr non_native_vector_base(T f)
+        : non_native_vector_base(bit_cast<data_t>(f))
+    {
+    }
+    __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
+    __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
+
+    __host__ __device__ constexpr operator data_v() const { return data_.dN; }
+    __host__ __device__ constexpr operator data_t() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dxN; // XXX this should cause an error
+        }
+    }
+    __host__ __device__ constexpr operator T() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dTxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dTxN; // XXX this should cause an error
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same_v<X, data_t>)
+        {
+            return data_.dxN;
+        }
+        else if constexpr(is_same_v<X, T>)
+        {
+            return data_.dTxN;
+        }
+        else if constexpr(is_same_v<X, data_v>)
+        {
+            return data_.dNx1;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same_v<X, data_t>)
+        {
+            return data_.dxN;
+        }
+        else if constexpr(is_same_v<X, T>)
+        {
+            return data_.dTxN;
+        }
+        else if constexpr(is_same_v<X, data_v>)
+        {
+            return data_.dNx1;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+// implementation for f6x16 and f6x32
+template <typename T, index_t N>
+struct non_native_vector_base<T, N, ck::enable_if_t<sizeof(T) == 12 || sizeof(T) == 24>>
+{
+    using data_t =
+        typename nnvb_data_t_selector<T>::type; // select data_t based on declared base type
+    using element_t = typename T::element_type; // select element_t based on declared element type
+    static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
+    static constexpr size_t size_factor =
+        sizeof(data_t) / sizeof(element_t); // f6x16: 12/4 = 3, f6x32: 24/4 = 6
+    using data_v = element_t __attribute__((ext_vector_type(N * size_factor)));
+    using type   = non_native_vector_base<T, N>;
+
+    union alignas(next_pow2(N * sizeof(T)))
+    {
+        data_v dN; // storage vector;
+        StaticallyIndexedArray<data_t, N> dxN;
+        StaticallyIndexedArray<T, N> dTxN;
+        StaticallyIndexedArray<data_v, 1> dNx1;
+    } data_;
+
+    __host__ __device__ constexpr non_native_vector_base(data_t a)
+        : data_{data_v(a.At(Number<0>{}))}
+    {
+    }
+    __host__ __device__ constexpr non_native_vector_base(T f)
+        : non_native_vector_base(bit_cast<data_t>(f))
+    {
+    }
+    __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
+    __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
+
+    __host__ __device__ constexpr operator data_v() const { return data_.dN; }
+    __host__ __device__ constexpr operator data_t() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dxN; // XXX this should cause an error
+        }
+    }
+    __host__ __device__ constexpr operator T() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dTxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dTxN; // XXX this should cause an error
+        }
+    }
+};
+
+template <typename T, index_t N>
+struct scalar_type<non_native_vector_base<T, N>>
+{
+    using type                           = typename non_native_vector_base<T, N>::data_t;
+    static constexpr index_t vector_size = N;
+};
+
+// non-native vector_type implementation
+template <typename T>
+struct vector_type<T, 1, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using type     = d1_nnv_t;
+
+    union alignas(next_pow2(1 * sizeof(T)))
+    {
+        d1_t d1_;
+        StaticallyIndexedArray<d1_t, 1> d1x1_;
+        d1_nnv_t d1_nnv_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{d1_t{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 2, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+
+    using type = d2_t;
+
+    union alignas(next_pow2(2 * sizeof(T)))
+    {
+        d2_t d2_;
+        StaticallyIndexedArray<d1_t, 2> d1x2_;
+        StaticallyIndexedArray<d2_t, 1> d2x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x2_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x2_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 4, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+
+    using type = d4_t;
+
+    union alignas(next_pow2(4 * sizeof(T)))
+    {
+        d4_t d4_;
+        StaticallyIndexedArray<d1_t, 4> d1x4_;
+        StaticallyIndexedArray<d2_t, 2> d2x2_;
+        StaticallyIndexedArray<d4_t, 1> d4x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x4_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x2_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x4_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x2_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 8, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+    using d8_t     = non_native_vector_base<T, 8>;
+
+    using type = d8_t;
+
+    union alignas(next_pow2(8 * sizeof(T)))
+    {
+        d8_t d8_;
+        StaticallyIndexedArray<d1_t, 8> d1x8_;
+        StaticallyIndexedArray<d2_t, 4> d2x4_;
+        StaticallyIndexedArray<d4_t, 2> d4x2_;
+        StaticallyIndexedArray<d8_t, 1> d8x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x8_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x4_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x2_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x8_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x4_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x2_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 16, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+    using d8_t     = non_native_vector_base<T, 8>;
+    using d16_t    = non_native_vector_base<T, 16>;
+
+    using type = d16_t;
+
+    union alignas(next_pow2(16 * sizeof(T)))
+    {
+        d16_t d16_;
+        StaticallyIndexedArray<d1_t, 16> d1x16_;
+        StaticallyIndexedArray<d2_t, 8> d2x8_;
+        StaticallyIndexedArray<d4_t, 4> d4x4_;
+        StaticallyIndexedArray<d8_t, 2> d8x2_;
+        StaticallyIndexedArray<d16_t, 1> d16x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x16_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x8_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x4_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x2_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x16_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x8_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x4_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x2_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 32, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t  = T;
+    using d2_t  = non_native_vector_base<T, 2>;
+    using d4_t  = non_native_vector_base<T, 4>;
+    using d8_t  = non_native_vector_base<T, 8>;
+    using d16_t = non_native_vector_base<T, 16>;
+    using d32_t = non_native_vector_base<T, 32>;
+
+    using type = d32_t;
+
+    union alignas(next_pow2(32 * sizeof(T)))
+    {
+        d32_t d32_;
+        StaticallyIndexedArray<d1_t, 32> d1x32_;
+        StaticallyIndexedArray<d2_t, 16> d2x16_;
+        StaticallyIndexedArray<d4_t, 8> d4x8_;
+        StaticallyIndexedArray<d8_t, 4> d8x4_;
+        StaticallyIndexedArray<d16_t, 2> d16x2_;
+        StaticallyIndexedArray<d32_t, 1> d32x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x32_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x16_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x8_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x4_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x2_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x32_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x16_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x8_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x4_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x2_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+template <typename T>
+struct vector_type<T, 64, typename ck::enable_if_t<!is_native_type<T>()>>
+{
+    using d1_t  = T;
+    using d2_t  = non_native_vector_base<T, 2>;
+    using d4_t  = non_native_vector_base<T, 4>;
+    using d8_t  = non_native_vector_base<T, 8>;
+    using d16_t = non_native_vector_base<T, 16>;
+    using d32_t = non_native_vector_base<T, 32>;
+    using d64_t = non_native_vector_base<T, 64>;
+
+    using type = d64_t;
+
+    union alignas(next_pow2(64 * sizeof(T)))
+    {
+        d64_t d64_;
+        StaticallyIndexedArray<d1_t, 64> d1x64_;
+        StaticallyIndexedArray<d2_t, 32> d2x32_;
+        StaticallyIndexedArray<d4_t, 16> d4x16_;
+        StaticallyIndexedArray<d8_t, 8> d8x8_;
+        StaticallyIndexedArray<d16_t, 4> d16x4_;
+        StaticallyIndexedArray<d32_t, 2> d32x2_;
+        StaticallyIndexedArray<d64_t, 1> d64x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {}
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x64_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x32_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x16_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x8_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x4_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x2_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x64_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x32_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x16_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x8_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x4_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x2_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x1_;
+        }
+        else
+        {
+            return err;
+        }
+    }
+};
+
+// fp32
+using float2_t  = typename vector_type<float, 2>::type;
+using float4_t  = typename vector_type<float, 4>::type;
+using float8_t  = typename vector_type<float, 8>::type;
+using float16_t = typename vector_type<float, 16>::type;
+using float32_t = typename vector_type<float, 32>::type;
+using float64_t = typename vector_type<float, 64>::type;
+
+// fp16
+using half2_t  = typename vector_type<half_t, 2>::type;
+using half4_t  = typename vector_type<half_t, 4>::type;
+using half8_t  = typename vector_type<half_t, 8>::type;
+using half16_t = typename vector_type<half_t, 16>::type;
+using half32_t = typename vector_type<half_t, 32>::type;
+
+// bfp16
+using bhalf2_t  = typename vector_type<bhalf_t, 2>::type;
+using bhalf4_t  = typename vector_type<bhalf_t, 4>::type;
+using bhalf8_t  = typename vector_type<bhalf_t, 8>::type;
+using bhalf16_t = typename vector_type<bhalf_t, 16>::type;
+using bhalf32_t = typename vector_type<bhalf_t, 32>::type;
+
+// i32
+using int32x2_t  = typename vector_type<int32_t, 2>::type;
+using int32x4_t  = typename vector_type<int32_t, 4>::type;
+using int32x8_t  = typename vector_type<int32_t, 8>::type;
+using int32x16_t = typename vector_type<int32_t, 16>::type;
+using int32x32_t = typename vector_type<int32_t, 32>::type;
+using int32x64_t = typename vector_type<int32_t, 64>::type;
+
+// i8
+using int8x2_t  = typename vector_type<int8_t, 2>::type;
+using int8x4_t  = typename vector_type<int8_t, 4>::type;
+using int8x8_t  = typename vector_type<int8_t, 8>::type;
+using int8x16_t = typename vector_type<int8_t, 16>::type;
+using int8x32_t = typename vector_type<int8_t, 32>::type;
+using int8x64_t = typename vector_type<int8_t, 64>::type;
+
+// f8
+using f8x2_fnuz_t  = typename vector_type<f8_fnuz_t, 2>::type;
+using f8x4_fnuz_t  = typename vector_type<f8_fnuz_t, 4>::type;
+using f8x8_fnuz_t  = typename vector_type<f8_fnuz_t, 8>::type;
+using f8x16_fnuz_t = typename vector_type<f8_fnuz_t, 16>::type;
+using f8x32_fnuz_t = typename vector_type<f8_fnuz_t, 32>::type;
+using f8x64_fnuz_t = typename vector_type<f8_fnuz_t, 64>::type;
+
+// bf8
+using bf8x2_fnuz_t  = typename vector_type<bf8_fnuz_t, 2>::type;
+using bf8x4_fnuz_t  = typename vector_type<bf8_fnuz_t, 4>::type;
+using bf8x8_fnuz_t  = typename vector_type<bf8_fnuz_t, 8>::type;
+using bf8x16_fnuz_t = typename vector_type<bf8_fnuz_t, 16>::type;
+using bf8x32_fnuz_t = typename vector_type<bf8_fnuz_t, 32>::type;
+using bf8x64_fnuz_t = typename vector_type<bf8_fnuz_t, 64>::type;
+
+// f8
+using f8x2_ocp_t  = typename vector_type<f8_ocp_t, 2>::type;
+using f8x4_ocp_t  = typename vector_type<f8_ocp_t, 4>::type;
+using f8x8_ocp_t  = typename vector_type<f8_ocp_t, 8>::type;
+using f8x16_ocp_t = typename vector_type<f8_ocp_t, 16>::type;
+using f8x32_ocp_t = typename vector_type<f8_ocp_t, 32>::type;
+using f8x64_ocp_t = typename vector_type<f8_ocp_t, 64>::type;
+
+// bf8
+using bf8x2_ocp_t  = typename vector_type<bf8_ocp_t, 2>::type;
+using bf8x4_ocp_t  = typename vector_type<bf8_ocp_t, 4>::type;
+using bf8x8_ocp_t  = typename vector_type<bf8_ocp_t, 8>::type;
+using bf8x16_ocp_t = typename vector_type<bf8_ocp_t, 16>::type;
+using bf8x32_ocp_t = typename vector_type<bf8_ocp_t, 32>::type;
+using bf8x64_ocp_t = typename vector_type<bf8_ocp_t, 64>::type;
+
+#if CK_FP8_TYPE_OCP
+// f8
+using f8x2_t  = f8x2_ocp_t;
+using f8x4_t  = f8x4_ocp_t;
+using f8x8_t  = f8x8_ocp_t;
+using f8x16_t = f8x16_ocp_t;
+using f8x32_t = f8x32_ocp_t;
+using f8x64_t = f8x64_ocp_t;
+
+// bf8
+using bf8x2_t  = bf8x2_ocp_t;
+using bf8x4_t  = bf8x4_ocp_t;
+using bf8x8_t  = bf8x8_ocp_t;
+using bf8x16_t = bf8x16_ocp_t;
+using bf8x32_t = bf8x32_ocp_t;
+using bf8x64_t = bf8x64_ocp_t;
+#elif CK_FP8_TYPE_FNUZ
+// f8
+using f8x2_t  = f8x2_fnuz_t;
+using f8x4_t  = f8x4_fnuz_t;
+using f8x8_t  = f8x8_fnuz_t;
+using f8x16_t = f8x16_fnuz_t;
+using f8x32_t = f8x32_fnuz_t;
+using f8x64_t = f8x64_fnuz_t;
+
+// bf8
+using bf8x2_t  = bf8x2_fnuz_t;
+using bf8x4_t  = bf8x4_fnuz_t;
+using bf8x8_t  = bf8x8_fnuz_t;
+using bf8x16_t = bf8x16_fnuz_t;
+using bf8x32_t = bf8x32_fnuz_t;
+using bf8x64_t = bf8x64_fnuz_t;
+#endif
+
+// u8
+using uint8x2_t  = typename vector_type<uint8_t, 2>::type;
+using uint8x4_t  = typename vector_type<uint8_t, 4>::type;
+using uint8x8_t  = typename vector_type<uint8_t, 8>::type;
+using uint8x16_t = typename vector_type<uint8_t, 16>::type;
+using uint8x32_t = typename vector_type<uint8_t, 32>::type;
+using uint8x64_t = typename vector_type<uint8_t, 64>::type;
+
+// f4
+using f4x2_t  = typename vector_type<f4x2_pk_t, 1>::type;
+using f4x4_t  = typename vector_type<f4x2_pk_t, 2>::type;
+using f4x8_t  = typename vector_type<f4x2_pk_t, 4>::type;
+using f4x16_t = typename vector_type<f4x2_pk_t, 8>::type;
+using f4x32_t = typename vector_type<f4x2_pk_t, 16>::type;
+using f4x64_t = typename vector_type<f4x2_pk_t, 32>::type;
+
+// f6
+using f6x16_t = typename vector_type<f6x16_pk_t, 1>::type;
+using f6x32_t = typename vector_type<f6x32_pk_t, 1>::type;
+
+// bf6
+using bf6x16_t = typename vector_type<bf6x16_pk_t, 1>::type;
+using bf6x32_t = typename vector_type<bf6x32_pk_t, 1>::type;
+
+// pack int4
+using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
+using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type;
+using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type;
+
+} // namespace ck
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index 6de17a6152..1d80f196b5 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -7,7 +7,11 @@
 #include "ck/utility/data_type.hpp"
 #include "enable_if.hpp"
 #include "c_style_pointer_cast.hpp"
+#if __clang_major__ == 20
+#include "amd_buffer_addressing_builtins.hpp"
+#else
 #include "amd_buffer_addressing.hpp"
+#endif
 #include "generic_memory_space_atomic.hpp"
 
 namespace ck {
@@ -20,7 +24,8 @@ template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
           bool InvalidElementUseNumericalZeroValue,
-          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
+          typename IndexType               = index_t>
 struct DynamicBuffer
 {
     using type = T;
@@ -55,16 +60,16 @@ struct DynamicBuffer
         return BufferAddressSpace;
     }
 
-    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+    __host__ __device__ constexpr const T& operator[](IndexType i) const { return p_data_[i]; }
 
-    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+    __host__ __device__ constexpr T& operator()(IndexType i) { return p_data_[i]; }
 
     template <typename X,
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                          typename scalar_type<remove_cvref_t<T>>::type>::value ||
                                      !is_native_type<X>(),
                                  bool>::type = false>
-    __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const
+    __host__ __device__ constexpr auto Get(IndexType i, bool is_valid_element) const
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -75,7 +80,7 @@ struct DynamicBuffer
                       "wrong! X should contain multiple T");
 
 #if CK_USE_AMD_BUFFER_LOAD
-        bool constexpr use_amd_buffer_addressing = true;
+        bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t);
 #else
         bool constexpr use_amd_buffer_addressing = false;
 #endif
@@ -136,7 +141,7 @@ struct DynamicBuffer
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                          typename scalar_type<remove_cvref_t<T>>::type>::value,
                                  bool>::type = false>
-    __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
+    __host__ __device__ void Update(IndexType i, bool is_valid_element, const X& x)
     {
         if constexpr(Op == InMemoryDataOperationEnum::Set)
         {
@@ -187,8 +192,8 @@ struct DynamicBuffer
 
     template <typename DstBuffer, index_t NumElemsPerThread>
     __host__ __device__ void DirectCopyToLds(DstBuffer& dst_buf,
-                                             index_t src_offset,
-                                             index_t dst_offset,
+                                             IndexType src_offset,
+                                             IndexType dst_offset,
                                              bool is_valid_element) const
     {
         // Copy data from global to LDS memory using direct loads.
@@ -210,7 +215,7 @@ struct DynamicBuffer
                                          typename scalar_type<remove_cvref_t<T>>::type>::value ||
                                      !is_native_type<X>(),
                                  bool>::type = false>
-    __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x)
+    __host__ __device__ void Set(IndexType i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -220,8 +225,8 @@ struct DynamicBuffer
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
 
-#if CK_USE_AMD_BUFFER_STORE
-        bool constexpr use_amd_buffer_addressing = true;
+#if CK_USE_AMD_BUFFER_LOAD
+        bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t);
 #else
         bool constexpr use_amd_buffer_addressing      = false;
 #endif
@@ -338,11 +343,12 @@ struct DynamicBuffer
         {
             if(is_valid_element)
             {
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+#if 0
                 X tmp = x;
 
                 __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
 #else
+                // if(i >= 2169041600)
                 *c_style_pointer_cast<X*>(&p_data_[i]) = x;
 #endif
             }
@@ -353,7 +359,7 @@ struct DynamicBuffer
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                          typename scalar_type<remove_cvref_t<T>>::type>::value,
                                  bool>::type = false>
-    __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
+    __host__ __device__ void AtomicAdd(IndexType i, bool is_valid_element, const X& x)
     {
         using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
 
@@ -374,12 +380,14 @@ struct DynamicBuffer
             (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
             (is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0);
 #elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
-        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
+        bool constexpr use_amd_buffer_addressing =
+            sizeof(IndexType) <= sizeof(int32_t) && is_same_v<remove_cvref_t<scalar_t>, int32_t>;
 #elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
         bool constexpr use_amd_buffer_addressing =
-            is_same_v<remove_cvref_t<scalar_t>, float> ||
-            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
-            (is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0);
+            sizeof(IndexType) <= sizeof(int32_t) &&
+            (is_same_v<remove_cvref_t<scalar_t>, float> ||
+             (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
+             (is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0));
 #else
         bool constexpr use_amd_buffer_addressing = false;
 #endif
@@ -404,12 +412,12 @@ struct DynamicBuffer
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                          typename scalar_type<remove_cvref_t<T>>::type>::value,
                                  bool>::type = false>
-    __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
+    __host__ __device__ void AtomicMax(IndexType i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr IndexType scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr IndexType scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -417,8 +425,9 @@ struct DynamicBuffer
         static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
 
 #if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
-        using scalar_t                           = typename scalar_type<remove_cvref_t<T>>::type;
-        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, double>;
+        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
+        bool constexpr use_amd_buffer_addressing =
+            sizeof(IndexType) <= sizeof(int32_t) && is_same_v<remove_cvref_t<scalar_t>, double>;
 #else
         bool constexpr use_amd_buffer_addressing = false;
 #endif
@@ -451,6 +460,17 @@ __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize el
         p, element_space_size};
 }
 
+template <AddressSpaceEnum BufferAddressSpace,
+          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
+          typename T,
+          typename ElementSpaceSize>
+__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p,
+                                                            ElementSpaceSize element_space_size)
+{
+    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true, coherence, long_index_t>{
+        p, element_space_size};
+}
+
 template <
     AddressSpaceEnum BufferAddressSpace,
     AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
diff --git a/include/ck/utility/e8m0.hpp b/include/ck/utility/e8m0.hpp
index a692f533f8..f7d2a2f594 100644
--- a/include/ck/utility/e8m0.hpp
+++ b/include/ck/utility/e8m0.hpp
@@ -67,10 +67,10 @@ struct e8m0_bexp_t
 namespace utils {
 
 template <typename T>
-__host__ __device__ inline int get_exponent_value(T x);
+__host__ __device__ inline constexpr int32_t get_exponent_value(T x);
 
 template <>
-__host__ __device__ inline int get_exponent_value<e8m0_bexp_t>(e8m0_bexp_t x)
+__host__ __device__ inline constexpr int32_t get_exponent_value<e8m0_bexp_t>(e8m0_bexp_t x)
 {
     return x.data;
 }
diff --git a/include/ck/utility/env.hpp b/include/ck/utility/env.hpp
index 809f302f74..469fb70f10 100644
--- a/include/ck/utility/env.hpp
+++ b/include/ck/utility/env.hpp
@@ -184,4 +184,9 @@ void UpdateEnvVar(EnvVar, const std::string_view& val)
 }
 
 } // namespace ck
+
+// environment variable to enable logging:
+// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
+CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+
 #endif
diff --git a/include/ck/utility/f8_utils.hpp b/include/ck/utility/f8_utils.hpp
index 2533073225..799683ae65 100644
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "ck/utility/data_type.hpp"
+#include "ck/utility/numeric_utils.hpp"
 
 namespace ck {
 
diff --git a/include/ck/utility/functional2.hpp b/include/ck/utility/functional2.hpp
index 99c65f4eb8..a11963cb47 100644
--- a/include/ck/utility/functional2.hpp
+++ b/include/ck/utility/functional2.hpp
@@ -46,4 +46,28 @@ struct static_for
     }
 };
 
+namespace detail {
+
+template <typename T, T... Is>
+struct applier
+{
+    template <typename F>
+    __host__ __device__ constexpr void operator()(F f) const
+    {
+        // tweak -fbracket-depth if compilation fails. Clang default limit is 256
+        (f(Number<Is>{}), ...);
+    }
+};
+
+template <int32_t Size> // == sizeof...(Is)
+using make_applier = __make_integer_seq<applier, index_t, Size>;
+
+} // namespace detail
+
+template <index_t N>
+struct static_for<0, N, 1> : detail::make_applier<N>
+{
+    using detail::make_applier<N>::operator();
+};
+
 } // namespace ck
diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp
index 98f40a4363..ab9cc4199c 100644
--- a/include/ck/utility/generic_memory_space_atomic.hpp
+++ b/include/ck/utility/generic_memory_space_atomic.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 #include "data_type.hpp"
+#include "dtype_fp64.hpp"
 
 namespace ck {
 
diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp
index 05ae9093e2..7b079c541c 100644
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "ck/ck.hpp"
-#include "data_type.hpp"
+#include "numeric_limits.hpp"
 #include "integral_constant.hpp"
 #include "number.hpp"
 #include "type.hpp"
diff --git a/include/ck/utility/mxf4_utils.hpp b/include/ck/utility/mxf4_utils.hpp
index f6536377ba..b0b5297f77 100644
--- a/include/ck/utility/mxf4_utils.hpp
+++ b/include/ck/utility/mxf4_utils.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
+#ifndef CK_CODE_GEN_RTC
 #pragma once
 
-#include "ck/utility/data_type.hpp"
+#include "ck/utility/numeric_limits.hpp"
 #include "ck/utility/mxfp_utils.hpp"
 
 namespace ck::utils {
@@ -39,9 +40,7 @@ template <>
 __host__ __device__ inline float to_float<f4_t>(e8m0_bexp_t const scale, f4_t const data)
 {
     if(is_nan<f4_t>(scale, data))
-    {
-        return std::numeric_limits<float>::quiet_NaN();
-    }
+        return NumericLimits<float>::QuietNaN();
 
     if(is_zero<f4_t>(scale, data))
         return 0.0f;
@@ -105,5 +104,5 @@ __host__ __device__ inline f4_t sat_convert_to_type_sr<f4_t>(float value, uint32
 
     return res;
 }
-
 } // namespace ck::utils
+#endif
diff --git a/include/ck/utility/mxf6_utils.hpp b/include/ck/utility/mxf6_utils.hpp
index e3b37bedda..cf68188b3e 100644
--- a/include/ck/utility/mxf6_utils.hpp
+++ b/include/ck/utility/mxf6_utils.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
+#ifndef CK_CODE_GEN_RTC
 #pragma once
 
-#include "ck/utility/data_type.hpp"
+#include "ck/utility/numeric_limits.hpp"
 #include "ck/utility/mxfp_utils.hpp"
 
 namespace ck::utils {
@@ -138,7 +139,7 @@ template <>
 __host__ __device__ inline float to_float<f6_t>(e8m0_bexp_t const scale, f6_t const data)
 {
     if(is_nan<f6_t>(scale, data))
-        return std::numeric_limits<float>::quiet_NaN();
+        return NumericLimits<float>::QuietNaN();
 
     if(is_zero<f6_t>(scale, data))
         return 0.0f;
@@ -164,7 +165,7 @@ template <>
 __host__ __device__ inline float to_float<bf6_t>(e8m0_bexp_t const scale, bf6_t const data)
 {
     if(is_nan<bf6_t>(scale, data))
-        return std::numeric_limits<float>::quiet_NaN();
+        return NumericLimits<float>::QuietNaN();
 
     if(is_zero<bf6_t>(scale, data))
         return 0.0f;
@@ -307,7 +308,6 @@ __host__ __device__ inline bf6_t sat_convert_to_type_sr<bf6_t>(float value, uint
     if(std::isnan(value))
         return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
                     : NumericUtils<bf6_t>::data_max_positive_normal_mask;
-
     if(std::abs(value) > NumericLimits<bf6_t>::Max()) // covers inf case as well
         return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
                     : NumericUtils<bf6_t>::data_max_positive_normal_mask;
@@ -321,5 +321,5 @@ __host__ __device__ inline bf6_t sat_convert_to_type_sr<bf6_t>(float value, uint
 
     return res;
 }
-
 } // namespace ck::utils
+#endif
diff --git a/include/ck/utility/mxf8_utils.hpp b/include/ck/utility/mxf8_utils.hpp
index 2dbf997f6a..9046a24a3a 100644
--- a/include/ck/utility/mxf8_utils.hpp
+++ b/include/ck/utility/mxf8_utils.hpp
@@ -1,4 +1,7 @@
-#include "ck/utility/data_type.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/numeric_limits.hpp"
 #include "ck/utility/mxfp_utils.hpp"
 
 #if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
@@ -36,7 +39,7 @@ static __device__ float cast_to_f32_from_f8_scaled(float scale, fp8_storage_t v)
 }
 
 template <ck_fp8_interpretation_t interpret>
-static __device__ float2_t cast_to_f32x2_from_f8x2_scaled(float scale, fp8x2_storage_t v)
+static __device__ float2_t cast_to_f32_from_f8_scaled(float scale, fp8x2_storage_t v)
 {
     const auto i16val = bit_cast<uint16_t>(v);
 
diff --git a/include/ck/utility/mxfp_utils.hpp b/include/ck/utility/mxfp_utils.hpp
index e23836c87f..cf7a3e8713 100644
--- a/include/ck/utility/mxfp_utils.hpp
+++ b/include/ck/utility/mxfp_utils.hpp
@@ -3,6 +3,11 @@
 
 #pragma once
 
+#include "ck/utility/data_type.hpp"
+
+#ifdef CK_CODE_GEN_RTC
+#define UINT_MAX 4294967295
+#endif
 namespace ck::utils {
 
 union cvt
@@ -27,13 +32,13 @@ template <typename T>
 __host__ __device__ inline bool is_inf(e8m0_bexp_t const scale, T const data);
 
 template <typename T>
-__host__ __device__ inline int get_exponent_value(T x)
+__host__ __device__ inline constexpr int32_t get_exponent_value(T x)
 {
     x >>= NumericUtils<T>::mant;
 
     x &= ((1 << NumericUtils<T>::exp) - 1);
 
-    return static_cast<int>(x);
+    return static_cast<int32_t>(x);
 }
 
 template <typename T>
@@ -94,7 +99,7 @@ template <typename T>
 __host__ __device__ T sat_convert_to_type_sr(float value, uint32_t seed);
 
 template <typename T>
-inline T convert_to_type(float value)
+__host__ __device__ inline T convert_to_type(float value)
 {
     using bitwise_type = typename NumericUtils<T>::bitwise_type;
 
@@ -253,7 +258,7 @@ inline T convert_to_type(float value)
 }
 
 template <typename T>
-inline T convert_to_type_sr(float value, uint32_t seed)
+__host__ __device__ inline T convert_to_type_sr(float value, uint32_t seed)
 {
     if(std::abs(value) > NumericLimits<T>::Max())
     {
@@ -380,5 +385,4 @@ inline T convert_to_type_sr(float value, uint32_t seed)
     auto val = sign | biased_exp << NumericUtils<T>::mant | mant;
     return val;
 }
-
 } // namespace ck::utils
diff --git a/include/ck/utility/numeric_limits.hpp b/include/ck/utility/numeric_limits.hpp
new file mode 100644
index 0000000000..e59b7eceaf
--- /dev/null
+++ b/include/ck/utility/numeric_limits.hpp
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
+template <typename T>
+struct NumericLimits;
+
+template <>
+struct NumericLimits<int32_t>
+{
+    __host__ __device__ static constexpr int32_t Lowest() noexcept { return -2147483647 - 1; }
+
+    __host__ __device__ static constexpr int32_t Min() noexcept { return -2147483647 - 1; }
+
+    __host__ __device__ static constexpr int32_t Max() noexcept { return 2147483647; }
+
+    __host__ __device__ static constexpr int32_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr int32_t QuietNaN() { return 0; }
+};
+template <>
+struct NumericLimits<int16_t>
+{
+    __host__ __device__ static constexpr int16_t Lowest() noexcept { return -32768; }
+
+    __host__ __device__ static constexpr int16_t Min() noexcept { return -32768; }
+
+    __host__ __device__ static constexpr int16_t Max() noexcept { return 32767; }
+
+    __host__ __device__ static constexpr int16_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr int16_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<int8_t>
+{
+    __host__ __device__ static constexpr int8_t Lowest() noexcept { return -128; }
+
+    __host__ __device__ static constexpr int8_t Min() noexcept { return -128; }
+
+    __host__ __device__ static constexpr int8_t Max() noexcept { return 127; }
+
+    __host__ __device__ static constexpr int8_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr int8_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<uint32_t>
+{
+    __host__ __device__ static constexpr uint32_t Lowest() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint32_t Min() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint32_t Max() noexcept { return 4294967295U; }
+
+    __host__ __device__ static constexpr uint32_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint32_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<uint16_t>
+{
+    __host__ __device__ static constexpr uint16_t Lowest() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint16_t Min() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint16_t Max() noexcept { return 65535U; }
+
+    __host__ __device__ static constexpr uint16_t Infinity() noexcept { return 0; }
+
+    __host__ __device__ static constexpr uint16_t QuietNaN() { return 0; }
+};
+
+template <>
+struct NumericLimits<float>
+{
+    static constexpr unsigned int binary_min    = 0x00800000;
+    static constexpr unsigned int binary_max    = 0x7F7FFFFF;
+    static constexpr unsigned int binary_lowest = 0xFF7FFFFF;
+    static constexpr unsigned int binary_qnan   = 0xFFC00001;
+    static constexpr unsigned int binary_inf    = 0x7F800000;
+
+    __host__ __device__ static constexpr float Min() { return bit_cast<float>(binary_min); }
+
+    __host__ __device__ static constexpr float Max() { return bit_cast<float>(binary_max); }
+
+    __host__ __device__ static constexpr float Lowest() { return bit_cast<float>(binary_lowest); }
+
+    __host__ __device__ static constexpr float QuietNaN() { return bit_cast<float>(binary_qnan); }
+
+    __host__ __device__ static constexpr float Infinity() { return bit_cast<float>(binary_inf); }
+};
+
+template <>
+struct NumericLimits<half_t>
+{
+    static constexpr unsigned short binary_min    = 0x0400;
+    static constexpr unsigned short binary_max    = 0x7BFF;
+    static constexpr unsigned short binary_lowest = 0xFBFF;
+    static constexpr unsigned short binary_qnan   = 0x7FFF;
+
+    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
+
+    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
+
+    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
+
+    __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
+};
+
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+template <>
+struct NumericLimits<int4_t>
+{
+    __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
+
+    __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
+
+    __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
+};
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+
+template <>
+struct NumericLimits<f8_fnuz_t>
+{
+    // negative zero nan mode with exp bias = 8
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
+    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
+    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
+    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
+    // ieee mode with exp bias = 7
+    // static constexpr uint8_t binary_min    = 0x08; // 0b00001000
+    // static constexpr uint8_t binary_max    = 0x77; // 0b01110111
+    // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
+    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=0
+
+    __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
+
+    __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
+
+    __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
+
+    __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
+};
+
+template <>
+struct NumericLimits<bf8_fnuz_t>
+{
+    // negative zero nan mode with exp bias = 16
+    static constexpr uint8_t binary_min    = 0x04; // 0b00000100
+    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
+    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
+    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
+    // ieee mode with exp bias = 15
+    // static constexpr uint8_t binary_min    = 0x04; // 0b00000100
+    // static constexpr uint8_t binary_max    = 0x7B; // 0b01111011
+    // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
+    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=
+
+    __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
+
+    __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
+
+    __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
+
+    __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
+};
+
+template <>
+struct NumericLimits<f8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000 = 2^-6
+    static constexpr uint8_t binary_max    = 0x7E; // 0b01111110 = 448
+    static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
+    static constexpr uint8_t binary_qnan   = 0x7F; // 0b01111111
+
+    __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr f8_ocp_t Lowest()
+    {
+        return bit_cast<f8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr f8_ocp_t QuietNaN()
+    {
+        return bit_cast<f8_ocp_t>(binary_qnan);
+    }
+};
+
+template <>
+struct NumericLimits<bf8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x04; // 0b00000100 = 2^-14
+    static constexpr uint8_t binary_max    = 0x7B; // 0b01111011 = 57344
+    static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
+    static constexpr uint8_t binary_qnan   = 0x7D; // 0b01111101
+
+    __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Lowest()
+    {
+        return bit_cast<bf8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
+    {
+        return bit_cast<bf8_ocp_t>(binary_qnan);
+    }
+};
+
+template <>
+struct NumericLimits<f4_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x2; // 0b0010
+    static constexpr uint8_t binary_max_normal    = 0x7; // 0b0111
+    static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
+    static constexpr uint8_t binary_min_subnorm   = 0x1; // 0b0001
+    static constexpr uint8_t binary_max_subnorm   = 0x1; // 0b0001
+
+    static constexpr float data_max_normal_number    = 6;
+    static constexpr float data_min_subnormal_number = 0.5;
+
+    __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
+    __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
+    __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<f6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x07; // 0b000111
+
+    static constexpr float data_max_normal_number    = 7.5;
+    static constexpr float data_min_subnormal_number = 0.125;
+
+    __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Lowest()
+    {
+        return f6_t(binary_lowest_normal & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MinSubnorm()
+    {
+        return f6_t(binary_min_subnorm & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MaxSubnorm()
+    {
+        return f6_t(binary_max_subnorm & 0b111111);
+    }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<bf6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x03; // 0b000011
+
+    static constexpr float data_max_normal_number    = 28;
+    static constexpr float data_min_subnormal_number = 0.0625;
+
+    __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
+    __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
+    __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+#else
+template <typename T>
+struct NumericLimits
+{
+    __host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); }
+    __host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); }
+    __host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); }
+    __host__ __device__ static constexpr T QuietNaN()
+    {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+    __host__ __device__ static constexpr T Infinity() { return std::numeric_limits<T>::infinity(); }
+};
+
+template <>
+struct NumericLimits<half_t>
+{
+    static constexpr unsigned short binary_min    = 0x0400;
+    static constexpr unsigned short binary_max    = 0x7BFF;
+    static constexpr unsigned short binary_lowest = 0xFBFF;
+    static constexpr unsigned short binary_qnan   = 0x7FFF;
+
+    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
+
+    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
+
+    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
+
+    __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast<half_t>(binary_qnan); }
+};
+
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+template <>
+struct NumericLimits<int4_t>
+{
+    __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); }
+
+    __host__ __device__ static constexpr int4_t Max() { return int4_t(7); }
+
+    __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); }
+};
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+
+template <>
+struct NumericLimits<f8_fnuz_t>
+{
+    // negative zero nan mode with exp bias = 8
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
+    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
+    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
+    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
+    // ieee mode with exp bias = 7
+    // static constexpr uint8_t binary_min    = 0x08; // 0b00001000
+    // static constexpr uint8_t binary_max    = 0x77; // 0b01110111
+    // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
+    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=0
+
+    __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }
+
+    __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }
+
+    __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }
+
+    __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
+};
+
+template <>
+struct NumericLimits<bf8_fnuz_t>
+{
+    // negative zero nan mode with exp bias = 16
+    static constexpr uint8_t binary_min    = 0x04; // 0b00000100
+    static constexpr uint8_t binary_max    = 0x7F; // 0b01111111
+    static constexpr uint8_t binary_lowest = 0xFF; // 0b11111111
+    static constexpr uint8_t binary_qnan   = 0x80; // 0b10000000
+    // ieee mode with exp bias = 15
+    // static constexpr uint8_t binary_min    = 0x04; // 0b00000100
+    // static constexpr uint8_t binary_max    = 0x7B; // 0b01111011
+    // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
+    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=
+
+    __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }
+
+    __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }
+
+    __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }
+
+    __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
+};
+
+template <>
+struct NumericLimits<f8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000 = 2^-6
+    static constexpr uint8_t binary_max    = 0x7E; // 0b01111110 = 448
+    static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448
+    static constexpr uint8_t binary_qnan   = 0x7F; // 0b01111111
+
+    __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr f8_ocp_t Lowest()
+    {
+        return bit_cast<f8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr f8_ocp_t QuietNaN()
+    {
+        return bit_cast<f8_ocp_t>(binary_qnan);
+    }
+};
+
+template <>
+struct NumericLimits<bf8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x04; // 0b00000100 = 2^-14
+    static constexpr uint8_t binary_max    = 0x7B; // 0b01111011 = 57344
+    static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344
+    static constexpr uint8_t binary_qnan   = 0x7D; // 0b01111101
+
+    __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Lowest()
+    {
+        return bit_cast<bf8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
+    {
+        return bit_cast<bf8_ocp_t>(binary_qnan);
+    }
+};
+
+template <>
+struct NumericLimits<f4_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x2; // 0b0010
+    static constexpr uint8_t binary_max_normal    = 0x7; // 0b0111
+    static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
+    static constexpr uint8_t binary_min_subnorm   = 0x1; // 0b0001
+    static constexpr uint8_t binary_max_subnorm   = 0x1; // 0b0001
+
+    static constexpr float data_max_normal_number    = 6;
+    static constexpr float data_min_subnormal_number = 0.5;
+
+    __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
+    __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
+    __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<f6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x07; // 0b000111
+
+    static constexpr float data_max_normal_number    = 7.5;
+    static constexpr float data_min_subnormal_number = 0.125;
+
+    __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Lowest()
+    {
+        return f6_t(binary_lowest_normal & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MinSubnorm()
+    {
+        return f6_t(binary_min_subnorm & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MaxSubnorm()
+    {
+        return f6_t(binary_max_subnorm & 0b111111);
+    }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<bf6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x03; // 0b000011
+
+    static constexpr float data_max_normal_number    = 28;
+    static constexpr float data_min_subnormal_number = 0.0625;
+
+    __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
+    __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
+    __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+#endif
+
+template <>
+struct NumericLimits<e8m0_bexp_t>
+{
+    static constexpr e8m0_bexp_t binary_min  = 0x00; // 0b00000000
+    static constexpr e8m0_bexp_t binary_max  = 0xFE; // 0b11111110
+    static constexpr e8m0_bexp_t binary_qnan = 0xFF; // 0b11111111
+    static constexpr e8m0_bexp_t binary_1    = 0x7F; // 0b01111111
+    static constexpr e8m0_bexp_t binary_2    = 0x80; // 0b10000000
+    static constexpr e8m0_bexp_t binary_3    = 0x82; // 0b10000010
+    static constexpr e8m0_bexp_t binary_135  = 0x87; // 0b10000111
+    static constexpr e8m0_bexp_t binary_142  = 0x8E; // 0b10001110
+
+    __host__ __device__ static constexpr e8m0_bexp_t Min() { return e8m0_bexp_t(binary_min); }
+    __host__ __device__ static constexpr e8m0_bexp_t Max() { return e8m0_bexp_t(binary_max); }
+    __host__ __device__ static constexpr e8m0_bexp_t QuietNaN() { return e8m0_bexp_t(binary_qnan); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_1() { return e8m0_bexp_t(binary_1); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_2() { return e8m0_bexp_t(binary_2); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_3() { return e8m0_bexp_t(binary_3); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_135()
+    {
+        return e8m0_bexp_t(binary_135);
+    }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_142()
+    {
+        return e8m0_bexp_t(binary_142);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/utility/numeric_utils.hpp b/include/ck/utility/numeric_utils.hpp
new file mode 100644
index 0000000000..726f667518
--- /dev/null
+++ b/include/ck/utility/numeric_utils.hpp
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: MIT
+// // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+
+template <typename T>
+struct NumericUtils
+{
+};
+
+template <>
+struct NumericUtils<e8m0_bexp_t>
+{
+    static constexpr int exp  = 8;
+    static constexpr int mant = 0;
+    static constexpr int bias = 127;
+
+    static constexpr int unbiased_exp_min = -127;
+    static constexpr int unbiased_exp_max = 127;
+    static constexpr int biased_exp_min   = 0;
+    static constexpr int biased_exp_max   = 254;
+
+    using bitwise_type = uint8_t;
+};
+
+template <>
+struct NumericUtils<float>
+{
+    static constexpr int exp            = 8;
+    static constexpr int mant           = 23;
+    static constexpr int bias           = 127;
+    static constexpr uint32_t nan_mask  = 0x7F800000;
+    static constexpr uint32_t head_mask = 0xFF800000;
+    static constexpr uint32_t mant_mask = 0x7FFFFF;
+    static constexpr uint32_t exp_mask  = 0xFF;
+    static constexpr uint32_t Inf       = 0x7F800000;
+    static constexpr uint32_t NegInf    = 0xFF800000;
+    static constexpr uint32_t NaN       = 0x7F800001;
+    static constexpr uint32_t Neg0      = 0x80000000;
+    static constexpr bool has_inf       = true;
+    using bitwise_type                  = uint32_t;
+};
+
+template <>
+struct NumericUtils<half_t>
+{
+    static constexpr int exp            = 5;
+    static constexpr int mant           = 10;
+    static constexpr int bias           = 15;
+    static constexpr uint16_t nan_mask  = 0x7C00;
+    static constexpr uint16_t head_mask = 0xFC00;
+    static constexpr uint16_t mant_mask = 0x3FF;
+    static constexpr uint16_t exp_mask  = 0x1F;
+    static constexpr uint32_t Inf       = 0x7C00;
+    static constexpr uint32_t NegInf    = 0xFC00;
+    static constexpr uint32_t NaN       = 0x7C01;
+    static constexpr uint32_t Neg0      = 0x8000;
+    static constexpr bool has_inf       = true;
+    using bitwise_type                  = uint16_t;
+};
+
+template <>
+struct NumericUtils<bhalf_t>
+{
+    static constexpr int exp  = 8;
+    static constexpr int mant = 7;
+    static constexpr int bias = 128; // negative zero nan mode
+    // static constexpr int bias = 127; // ieee mode
+};
+
+template <>
+struct NumericUtils<f8_fnuz_t>
+{
+    static constexpr int exp  = 4;
+    static constexpr int mant = 3;
+    static constexpr int bias = 8; // negative zero nan mode
+    // static constexpr int bias = 7; // ieee mode
+    static constexpr bool has_inf = false;
+};
+
+template <>
+struct NumericUtils<bf8_fnuz_t>
+{
+    static constexpr int exp  = 5;
+    static constexpr int mant = 2;
+    static constexpr int bias = 16; // negative zero nan mode
+    // static constexpr int bias = 15; // ieee mode
+    static constexpr bool has_inf = false;
+};
+template <>
+struct NumericUtils<f8_ocp_t>
+{
+    static constexpr int exp  = 4;
+    static constexpr int mant = 3;
+    static constexpr int bias = 7;
+};
+
+template <>
+struct NumericUtils<bf8_ocp_t>
+{
+    static constexpr int exp  = 5;
+    static constexpr int mant = 2;
+    static constexpr int bias = 15;
+};
+
+template <>
+struct NumericUtils<f4_t>
+{
+    static constexpr int exp           = 2;
+    static constexpr int mant          = 1;
+    static constexpr int bias          = 1;
+    static constexpr uint32_t sr_shift = 10;
+
+    static constexpr int unbiased_exp_min = 0;
+    static constexpr int unbiased_exp_max = 2;
+    static constexpr int biased_exp_min   = 1;
+    static constexpr int biased_exp_max   = 3;
+
+    static constexpr uint8_t positive_zero_mask = 0b0000;
+    static constexpr uint8_t negative_zero_mask = 0b1000;
+
+    static constexpr uint8_t one_mask      = 0b0010;
+    static constexpr uint8_t set_sign_mask = 0b0111;
+
+    static constexpr uint8_t data_max_positive_normal_mask = 0b0111;
+    static constexpr uint8_t data_max_negative_normal_mask = 0b1111;
+
+    static constexpr uint8_t data_max_positive_subnormal_mask = 0b0001;
+    static constexpr uint8_t data_max_negative_subnormal_mask = 0b1001;
+
+    static constexpr bool has_inf = false;
+
+    using bitwise_type = uint8_t;
+};
+
+template <>
+struct NumericUtils<f6_t>
+{
+    static constexpr int exp           = 2;
+    static constexpr int mant          = 3;
+    static constexpr int bias          = 1;
+    static constexpr uint32_t sr_shift = 12;
+
+    static constexpr int unbiased_exp_min = 0;
+    static constexpr int unbiased_exp_max = 2;
+    static constexpr int biased_exp_min   = 1;
+    static constexpr int biased_exp_max   = 3;
+
+    static constexpr uint8_t positive_zero_mask = 0b000000;
+    static constexpr uint8_t negative_zero_mask = 0b100000;
+
+    static constexpr uint8_t set_sign_mask = 0b011111;
+
+    static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
+    static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
+
+    static constexpr uint8_t data_max_positive_subnormal_mask = 0b000111;
+    static constexpr uint8_t data_max_negative_subnormal_mask = 0b100111;
+
+    static constexpr bool has_inf  = false;
+    static constexpr bool has_nan  = false;
+    static constexpr bool has_zero = true;
+
+    using bitwise_type = uint8_t;
+};
+
+template <>
+struct NumericUtils<bf6_t>
+{
+    static constexpr int exp           = 3;
+    static constexpr int mant          = 2;
+    static constexpr int bias          = 3;
+    static constexpr uint32_t sr_shift = 11;
+
+    static constexpr int unbiased_exp_min = -2;
+    static constexpr int unbiased_exp_max = 4;
+    static constexpr int biased_exp_min   = 1;
+    static constexpr int biased_exp_max   = 7;
+
+    static constexpr uint8_t positive_zero_mask = 0b000000;
+    static constexpr uint8_t negative_zero_mask = 0b100000;
+
+    static constexpr uint8_t set_sign_mask = 0b011111;
+
+    static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
+    static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
+
+    static constexpr uint8_t data_max_positive_subnormal_mask = 0b000011;
+    static constexpr uint8_t data_max_negative_subnormal_mask = 0b100011;
+
+    static constexpr bool has_inf  = false;
+    static constexpr bool has_nan  = false;
+    static constexpr bool has_zero = true;
+
+    using bitwise_type = uint8_t;
+};
+} // namespace ck
diff --git a/include/ck/utility/scaled_type_convert.hpp b/include/ck/utility/scaled_type_convert.hpp
index 9a9c53caec..f3e2bd3dd9 100644
--- a/include/ck/utility/scaled_type_convert.hpp
+++ b/include/ck/utility/scaled_type_convert.hpp
@@ -67,7 +67,7 @@ inline __host__ float2_t scaled_type_convert<float2_t, f8x2_ocp_t>(e8m0_bexp_t s
 #endif
 {
 #if CK_MX_FP8_CVT_FAST_PATH
-    return fp8_impl::cast_to_f32x2_from_f8x2_scaled<f8_ocp_t::default_interpret>(
+    return fp8_impl::cast_to_f32_from_f8_scaled<f8_ocp_t::default_interpret>(
         type_convert<float>(scale), x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
 #else
     return float2_t{scaled_type_convert<float>(scale, x.AsType<f8_ocp_t>()[Number<0>{}]),
@@ -86,7 +86,7 @@ inline __host__ float2_t scaled_type_convert<float2_t, bf8x2_ocp_t>(e8m0_bexp_t
 #endif
 {
 #if CK_MX_FP8_CVT_FAST_PATH
-    return fp8_impl::cast_to_f32x2_from_f8x2_scaled<bf8_ocp_t::default_interpret>(
+    return fp8_impl::cast_to_f32_from_f8_scaled<bf8_ocp_t::default_interpret>(
         type_convert<float>(scale), x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
 #else
     return float2_t{scaled_type_convert<float>(scale, x.AsType<bf8_ocp_t>()[Number<0>{}]),
diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp
index 25dae4e335..497625f7e2 100644
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -184,6 +184,21 @@ struct Sequence
     }
 };
 
+namespace impl {
+template <typename T, T... Ints>
+struct __integer_sequence;
+
+template <index_t... Ints>
+struct __integer_sequence<index_t, Ints...>
+{
+    using seq_type = Sequence<Ints...>;
+};
+} // namespace impl
+
+template <index_t N>
+using make_index_sequence =
+    typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type;
+
 // merge sequence
 template <typename Seq, typename... Seqs>
 struct sequence_merge
@@ -256,6 +271,18 @@ struct arithmetic_sequence_gen
     using type = typename conditional<kHasContent, type0, type1>::type;
 };
 
+template <index_t IEnd>
+struct arithmetic_sequence_gen<0, IEnd, 1>
+{
+    template <typename T, T... Ints>
+    struct WrapSequence
+    {
+        using type = Sequence<Ints...>;
+    };
+    // https://reviews.llvm.org/D13786
+    using type = typename __make_integer_seq<WrapSequence, index_t, IEnd>::type;
+};
+
 // uniform sequence
 template <index_t NSize, index_t I>
 struct uniform_sequence_gen
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index b4f1545aa9..ec055fb2a2 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -11,8 +11,20 @@
 
 namespace ck {
 
+template <typename F, index_t... ids>
+__host__ __device__ constexpr auto generate_tuple_for(F&& f, Sequence<ids...>)
+{
+    return make_tuple(f(Number<ids>{})...);
+}
+
 template <typename F, index_t N>
 __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
+{
+    return generate_tuple_for(f, make_index_sequence<N>{});
+}
+
+template <typename F, index_t N>
+__host__ __device__ constexpr auto generate_tuple(F&& f, LongNumber<N>)
 {
     return unpack([&f](auto&&... xs) { return make_tuple(f(xs)...); },
                   typename arithmetic_sequence_gen<0, N, 1>::type{});
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index 51836fd60b..04ae046ac8 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -14,7 +14,7 @@
 
 namespace ck {
 // Define the common macro for MI300 models
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__)
 #define __gfx94__
 #endif
 
@@ -117,7 +117,7 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
 #if CK_USE_RNE_BF16_CONVERSION
     return bf16_convert_rtn<bhalf_t>(x);
 #else
-    return uint16_t(u.int32 >> 16);
+    return uint16_t(static_cast<uint32_t>(x) >> 16);
 #endif
 }
 
@@ -356,6 +356,180 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, half_t>(half_t x
 #endif
 }
 
+/**
+ * @brief Converts a float to a 8-bit float type (f8_ocp_t) using stochastic rounding.
+ *
+ * @param x     The input float value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation, true>(
+            x)};
+}
+
+/**
+ * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using
+ * stochastic rounding.
+ *
+ * @param x     The input vector of 2 floats.
+ * @return      The converted vector of 2 f8_ocp_t.
+ */
+template <>
+inline __host__ __device__ f8x2_ocp_t f8_convert_sr<f8x2_ocp_t, float2_t>(float2_t x)
+{
+    return f8x2_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation, true>(
+            x)};
+}
+
+/**
+ * @brief Converts a float to a 8-bit float type (bf8_ocp_t) using stochastic rounding.
+ *
+ * @param x     The input float value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret,
+                                                bf8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using
+ * stochastic rounding.
+ *
+ * @param x     The input vector of 2 floats.
+ * @return      The converted vector of 2 bf8_ocp_t.
+ */
+template <>
+inline __host__ __device__ bf8x2_ocp_t f8_convert_sr<bf8x2_ocp_t, float2_t>(float2_t x)
+{
+    return bf8x2_ocp_t{fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret,
+                                                  bf8_ocp_t::default_saturation,
+                                                  true>(x)};
+}
+
+/**
+ * @brief Converts a half_t to a 8-bit float type (f8_ocp_t) using stochastic rounding.
+ *
+ * @param x     The input half_t value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, half_t>(half_t x)
+{
+    return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret,
+                                                f8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using
+ * stochastic rounding.
+ *
+ * @param x     The input vector of 2 half_t.
+ * @return      The converted vector of 2 f8_ocp_t.
+ */
+template <>
+inline __host__ __device__ f8x2_ocp_t f8_convert_sr<f8x2_ocp_t, half2_t>(half2_t x)
+{
+    return f8x2_ocp_t{fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret,
+                                                  f8_ocp_t::default_saturation,
+                                                  true>(x)};
+}
+
+/**
+ * @brief Converts a half_t to a 8-bit half_t type (bf8_ocp_t) using stochastic rounding.
+ *
+ * @param x     The input half_t value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, half_t>(half_t x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                 bf8_ocp_t::default_saturation,
+                                                 true>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using
+ * stochastic rounding.
+ *
+ * @param x     The input vector of 2 half_t.
+ * @return      The converted vector of 2 bf8_ocp_t.
+ */
+template <>
+inline __host__ __device__ bf8x2_ocp_t f8_convert_sr<bf8x2_ocp_t, half2_t>(half2_t x)
+{
+    return bf8x2_ocp_t{fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                   bf8_ocp_t::default_saturation,
+                                                   true>(x)};
+}
+
+/**
+ * @brief Converts a bhalf_t to a 8-bit float type (f8_ocp_t) using stochastic rounding.
+ *
+ * @param x     The input bhalf_t value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, bhalf_t>(bhalf_t x)
+{
+    return f8_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8<f8_ocp_t::default_interpret,
+                                                 f8_ocp_t::default_saturation,
+                                                 true>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using
+ * stochastic rounding.
+ *
+ * @param x     The input vector of 2 bhalf_t.
+ * @return      The converted vector of 2 f8_ocp_t.
+ */
+template <>
+inline __host__ __device__ f8x2_ocp_t f8_convert_sr<f8x2_ocp_t, bhalf2_t>(bhalf2_t x)
+{
+    return f8x2_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8<f8_ocp_t::default_interpret,
+                                                   f8_ocp_t::default_saturation,
+                                                   true>(x)};
+}
+
+/**
+ * @brief Converts a bhalf_t to a 8-bit half_t type (bf8_ocp_t) using stochastic rounding.
+ *
+ * @param x     The input bhalf_t value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, bhalf_t>(bhalf_t x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                  bf8_ocp_t::default_saturation,
+                                                  true>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using
+ * stochastic rounding.
+ *
+ * @param x     The input vector of 2 bhalf_t.
+ * @return      The converted vector of 2 bf8_ocp_t.
+ */
+template <>
+inline __host__ __device__ bf8x2_ocp_t f8_convert_sr<bf8x2_ocp_t, bhalf2_t>(bhalf2_t x)
+{
+    return bf8x2_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                    bf8_ocp_t::default_saturation,
+                                                    true>(x)};
+}
+
 // Declare a template function for fp8 conversion using RNE
 template <typename Y, typename X>
 __host__ __device__ constexpr Y f8_convert_rne(X x);
@@ -466,6 +640,172 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, half_t>(half_t
 #endif
 }
 
+/**
+ * @brief Converts a float to a 8-bit float type (f8_ocp_t) using rounding to nearest/even.
+ *
+ * @param x     The input float value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding
+ * to nearest/even.
+ *
+ * @param x     The input vector of 2 floats.
+ * @return      The converted vector of 2 f8_ocp_t.
+ */
+template <>
+inline __host__ __device__ f8x2_ocp_t f8_convert_rne<f8x2_ocp_t, float2_t>(float2_t x)
+{
+    return f8x2_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a float to a 8-bit float type (bf8_ocp_t) using rounding to nearest/even.
+ *
+ * @param x     The input float value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using
+ * rounding to nearest/even.
+ *
+ * @param x     The input vector of 2 floats.
+ * @return      The converted vector of 2 bf8_ocp_t.
+ */
+template <>
+inline __host__ __device__ bf8x2_ocp_t f8_convert_rne<bf8x2_ocp_t, float2_t>(float2_t x)
+{
+    return bf8x2_ocp_t{
+        fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a half_t to a 8-bit float type (f8_ocp_t) using rounding to nearest/even.
+ *
+ * @param x     The input half_t value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, half_t>(half_t x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding
+ * to nearest/even.
+ *
+ * @param x     The input vector of 2 half_t.
+ * @return      The converted vector of 2 f8_ocp_t.
+ */
+template <>
+inline __host__ __device__ f8x2_ocp_t f8_convert_rne<f8x2_ocp_t, half2_t>(half2_t x)
+{
+    return f8x2_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a half_t to a 8-bit half_t type (bf8_ocp_t) using rounding to nearest/even.
+ *
+ * @param x     The input half_t value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, half_t>(half_t x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
+/**
+ * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using
+ * rounding to nearest/even.
+ *
+ * @param x     The input vector of 2 half_t.
+ * @return      The converted vector of 2 bf8_ocp_t.
+ */
+template <>
+inline __host__ __device__ bf8x2_ocp_t f8_convert_rne<bf8x2_ocp_t, half2_t>(half2_t x)
+{
+    return bf8x2_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
+/**
+ * @brief Converts a bhalf_t to a 8-bit float type (f8_ocp_t) using rounding to nearest/even.
+ *
+ * @param x     The input bhalf_t value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, bhalf_t>(bhalf_t x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_bhalf_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using
+ * rounding to nearest/even.
+ *
+ * @param x     The input vector of 2 bhalf_t.
+ * @return      The converted vector of 2 f8_ocp_t.
+ */
+template <>
+inline __host__ __device__ f8x2_ocp_t f8_convert_rne<f8x2_ocp_t, bhalf2_t>(bhalf2_t x)
+{
+    return f8x2_ocp_t{
+        fp8_impl::cvt_bhalf_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+/**
+ * @brief Converts a bhalf_t to a 8-bit half_t type (bf8_ocp_t) using rounding to nearest/even.
+ *
+ * @param x     The input bhalf_t value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, bhalf_t>(bhalf_t x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_bhalf_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
+/**
+ * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using
+ * rounding to nearest/even.
+ *
+ * @param x     The input vector of 2 bhalf_t.
+ * @return      The converted vector of 2 bf8_ocp_t.
+ */
+template <>
+inline __host__ __device__ bf8x2_ocp_t f8_convert_rne<bf8x2_ocp_t, bhalf2_t>(bhalf2_t x)
+{
+    return bf8x2_ocp_t{
+        fp8_impl::cvt_bhalf_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
 // convert fp32 to fp8
 template <>
 inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, float>(float x)
@@ -477,17 +817,6 @@ inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, float>(float x)
 #endif
 }
 
-// convert fp32 to fp8
-template <>
-inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, float>(float x)
-{
-#if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_ocp_t>(x);
-#else
-    return f8_convert_rne<f8_ocp_t>(x);
-#endif
-}
-
 // convert fp8 to fp32
 template <>
 inline __host__ __device__ float type_convert<float, f8_fnuz_t>(f8_fnuz_t x)
@@ -524,12 +853,39 @@ inline __host__ __device__ float2_t type_convert<float2_t, f8x2_fnuz_t>(f8x2_fnu
 #endif
 }
 
+/**
+ * @brief Converts a f8_ocp_t value to a float value.
+ *
+ * @param x     The input f8_ocp_t value.
+ * @return      The converted float value.
+ */
+template <>
+inline __host__ __device__ float type_convert<float, f8_ocp_t>(f8_ocp_t x)
+{
+#if CK_OCP_FP8_CVT_FAST_PATH
+    union
+    {
+        unsigned int i32val;
+        fp8_storage_t i8val[4];
+    } val;
+    val.i8val[0] = x.data;
+    return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0);
+#else
+    return fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(x.data);
+#endif
+}
+
+/**
+ * @brief Converts a vector of 2 f8_ocp_t values to a vector of 2 float values.
+ *
+ * @param x     The input vector of 2 f8_ocp_t values.
+ * @return      The converted vector of 2 float values.
+ */
 template <>
 inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_t x)
 {
 #if CK_OCP_FP8_CVT_FAST_PATH
-    return fp8_impl::cast_to_f32x2_from_f8x2<f8_ocp_t::default_interpret>(
-        x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
+    return __builtin_amdgcn_cvt_pk_f32_fp8(bit_cast<uint16_t>(x), false);
 #else
     return float2_t{fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
                         x.AsType<fp8_storage_t>()[Number<0>{}]),
@@ -538,6 +894,229 @@ inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_
 #endif
 }
 
+/**
+ * @brief Converts a f8_ocp_t value to a half_t value.
+ *
+ * @param x     The input f8_ocp_t value.
+ * @return      The converted half_t value.
+ */
+template <>
+inline __host__ __device__ half_t type_convert<half_t, f8_ocp_t>(f8_ocp_t x)
+{
+#if defined(__gfx950__)
+    union
+    {
+        uint16_t i16val;
+        fp8_storage_t i8val[2];
+    } input;
+    input.i8val[0] = x.data;
+
+    union
+    {
+        half2_t half_vec;
+        half_t half_arr[2];
+    } output;
+    output.half_vec = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(input.i16val, /*scale*/ 1.f, 0);
+
+    return output.half_arr[0];
+#else
+    return fp8_impl::cast_from_f8<half_t, f8_ocp_t::wm, f8_ocp_t::we, false>(x.data);
+#endif
+}
+
+/**
+ * @brief Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values.
+ *
+ * @param x     The input vector of 2 f8_ocp_t values.
+ * @return      The converted vector of 2 half_t values.
+ */
+template <>
+inline __host__ __device__ half2_t type_convert<half2_t, f8x2_ocp_t>(f8x2_ocp_t x)
+{
+#if defined(__gfx950__)
+    return __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(bit_cast<uint16_t>(x), /*scale*/ 1.f, 0);
+#else
+    return half2_t{type_convert<half_t>(float(x.AsType<f8_ocp_t>()[Number<0>{}])),
+                   type_convert<half_t>(float(x.AsType<f8_ocp_t>()[Number<1>{}]))};
+#endif
+}
+
+/**
+ * @brief Converts a f8_ocp_t value to a bhalf_t value.
+ *
+ * @param x     The input f8_ocp_t value.
+ * @return      The converted bhalf_t value.
+ */
+template <>
+inline __host__ __device__ bhalf_t type_convert<bhalf_t, f8_ocp_t>(f8_ocp_t x)
+{
+#if defined(__gfx950__)
+    union
+    {
+        uint16_t i16val;
+        fp8_storage_t i8val[2];
+    } input;
+    input.i8val[0] = x.data;
+
+    union
+    {
+        bhalf2_t bhalf_vec;
+        bhalf_t bhalf_arr[2];
+    } output;
+    output.bhalf_vec = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(input.i16val, /*scale*/ 1.f, 0);
+
+    return output.bhalf_arr[0];
+#else
+    return type_convert<bhalf_t>(
+        fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(x.data));
+#endif
+}
+
+/**
+ * @brief Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values.
+ *
+ * @param x     The input vector of 2 f8_ocp_t values.
+ * @return      The converted vector of 2 bhalf_t values.
+ */
+template <>
+inline __host__ __device__ bhalf2_t type_convert<bhalf2_t, f8x2_ocp_t>(f8x2_ocp_t x)
+{
+#if defined(__gfx950__)
+    return __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(bit_cast<uint16_t>(x), /*scale*/ 1.f, 0);
+#else
+    return bhalf2_t{type_convert<bhalf_t>(float(x.AsType<f8_ocp_t>()[Number<0>{}])),
+                    type_convert<bhalf_t>(float(x.AsType<f8_ocp_t>()[Number<1>{}]))};
+#endif
+}
+
+/**
+ * @brief Converts a bf8_ocp_t value to a float value.
+ *
+ * @param x     The input bf8_ocp_t value.
+ * @return      The converted float value.
+ */
+template <>
+inline __host__ __device__ float type_convert<float, bf8_ocp_t>(bf8_ocp_t x)
+{
+#if CK_OCP_FP8_CVT_FAST_PATH
+    union
+    {
+        unsigned int i32val;
+        fp8_storage_t i8val[4];
+    } val;
+    val.i8val[0] = x.data;
+    return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
+#else
+    return fp8_impl::cast_from_f8<float, bf8_ocp_t::wm, bf8_ocp_t::we, false>(x.data);
+#endif
+}
+
+/**
+ * @brief Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values.
+ *
+ * @param x     The input vector of 2 bf8_ocp_t values.
+ * @return      The converted vector of 2 float values.
+ */
+template <>
+inline __host__ __device__ float2_t type_convert<float2_t, bf8x2_ocp_t>(bf8x2_ocp_t x)
+{
+#if CK_OCP_FP8_CVT_FAST_PATH
+    return __builtin_amdgcn_cvt_pk_f32_bf8(bit_cast<uint16_t>(x), false);
+#else
+    return float2_t{fp8_impl::cast_from_f8<float, bf8_ocp_t::wm, bf8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<0>{}]),
+                    fp8_impl::cast_from_f8<float, bf8_ocp_t::wm, bf8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<1>{}])};
+#endif
+}
+
+/**
+ * @brief Converts a bf8_ocp_t value to a half_t value.
+ *
+ * @param x     The input bf8_ocp_t value.
+ * @return      The converted half_t value.
+ */
+template <>
+inline __host__ __device__ half_t type_convert<half_t, bf8_ocp_t>(bf8_ocp_t x)
+{
+#if defined(__gfx950__)
+    union
+    {
+        uint16_t i16val;
+        fp8_storage_t i8val[2];
+    } val;
+    val.i8val[0] = x.data;
+    return __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(val.i16val, /*scale*/ 1.f, 0)[0];
+#else
+    return fp8_impl::cast_from_f8<half_t, bf8_ocp_t::wm, bf8_ocp_t::we, false>(x.data);
+#endif
+}
+
+/**
+ * @brief Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values.
+ *
+ * @param x     The input vector of 2 bf8_ocp_t values.
+ * @return      The converted vector of 2 half_t values.
+ */
+template <>
+inline __host__ __device__ half2_t type_convert<half2_t, bf8x2_ocp_t>(bf8x2_ocp_t x)
+{
+#if defined(__gfx950__)
+    return __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(bit_cast<uint16_t>(x), /*scale*/ 1.f, 0);
+#else
+    return half2_t{type_convert<half_t>(float(x.AsType<bf8_ocp_t>()[Number<0>{}])),
+                   type_convert<half_t>(float(x.AsType<bf8_ocp_t>()[Number<1>{}]))};
+#endif
+}
+
+/**
+ * @brief Converts a bf8_ocp_t value to a bhalf_t value.
+ *
+ * @param x     The input bf8_ocp_t value.
+ * @return      The converted bhalf_t value.
+ */
+template <>
+inline __host__ __device__ bhalf_t type_convert<bhalf_t, bf8_ocp_t>(bf8_ocp_t x)
+{
+#if defined(__gfx950__)
+    union
+    {
+        uint16_t i16val;
+        fp8_storage_t i8val[2];
+    } input;
+    input.i8val[0] = x.data;
+
+    union
+    {
+        bhalf2_t bhalf_vec;
+        bhalf_t bhalf_arr[2];
+    } output;
+    output.bhalf_vec = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(input.i16val, /*scale*/ 1.f, 0);
+
+    return output.bhalf_arr[0];
+#else
+    return type_convert<bhalf_t>(
+        fp8_impl::cast_from_f8<float, bf8_ocp_t::wm, bf8_ocp_t::we, false>(x.data));
+#endif
+}
+
+/**
+ * @brief Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values.
+ *
+ * @param x     The input vector of 2 bf8_ocp_t values.
+ * @return      The converted vector of 2 bhalf_t values.
+ */
+template <>
+inline __host__ __device__ bhalf2_t type_convert<bhalf2_t, bf8x2_ocp_t>(bf8x2_ocp_t x)
+{
+#if defined(__gfx950__)
+    return __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(bit_cast<uint16_t>(x), /*scale*/ 1.f, 0);
+#else
+    return bhalf2_t{type_convert<bhalf_t>(float(x.AsType<bf8_ocp_t>()[Number<0>{}])),
+                    type_convert<bhalf_t>(float(x.AsType<bf8_ocp_t>()[Number<1>{}]))};
+#endif
+}
+
 template <>
 inline __host__ __device__ float2_t type_convert<float2_t, pk_i4_t>(pk_i4_t x)
 {
@@ -610,7 +1189,12 @@ inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, half_t>(half_t x)
 #endif
 }
 
-// convert fp16 to fp8
+/**
+ * @brief Converts a half_t value to a f8_ocp_t value with rounding determined by a flag.
+ *
+ * @param x     The input half_t value.
+ * @return      The converted f8_ocp_t value.
+ */
 template <>
 inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, half_t>(half_t x)
 {
@@ -621,6 +1205,22 @@ inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, half_t>(half_t x)
 #endif
 }
 
+/**
+ * @brief Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag.
+ *
+ * @param x     The input half_t value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_ocp_t>(x);
+#else
+    return f8_convert_rne<bf8_ocp_t>(x);
+#endif
+}
+
 // convert fp8 to fp16
 template <>
 inline __host__ __device__ half_t type_convert<half_t, f8_fnuz_t>(f8_fnuz_t x)
@@ -645,7 +1245,28 @@ inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, float>(float x)
 #endif
 }
 
-// convert fp32 to bf8
+/**
+ * @brief Converts a float value to a f8_ocp_t value with rounding determined by a flag.
+ *
+ * @param x     The input float value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_ocp_t>(x);
+#else
+    return f8_convert_rne<f8_ocp_t>(x);
+#endif
+}
+
+/**
+ * @brief Converts a float value to a bf8_ocp_t value with rounding determined by a flag.
+ *
+ * @param x     The input float value.
+ * @return      The converted bf8_ocp_t value.
+ */
 template <>
 inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, float>(float x)
 {
@@ -656,6 +1277,38 @@ inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, float>(float x)
 #endif
 }
 
+/**
+ * @brief Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag.
+ *
+ * @param x     The input bhalf_t value.
+ * @return      The converted f8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, bhalf_t>(bhalf_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_ocp_t>(x);
+#else
+    return f8_convert_rne<f8_ocp_t>(x);
+#endif
+}
+
+/**
+ * @brief Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag.
+ *
+ * @param x     The input bhalf_t value.
+ * @return      The converted bf8_ocp_t value.
+ */
+template <>
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, bhalf_t>(bhalf_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_ocp_t>(x);
+#else
+    return f8_convert_rne<bf8_ocp_t>(x);
+#endif
+}
+
 // convert bf8 to fp32
 template <>
 inline __host__ __device__ float type_convert<float, bf8_fnuz_t>(bf8_fnuz_t x)
@@ -683,17 +1336,6 @@ inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, half_t>(half_t x)
 #endif
 }
 
-// convert fp16 to bf8
-template <>
-inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, half_t>(half_t x)
-{
-#if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_ocp_t>(x);
-#else
-    return f8_convert_rne<bf8_ocp_t>(x);
-#endif
-}
-
 // convert bf8 to fp16
 template <>
 inline __host__ __device__ half_t type_convert<half_t, bf8_fnuz_t>(bf8_fnuz_t x)
@@ -706,7 +1348,7 @@ inline __host__ __device__ half_t type_convert<half_t, bf8_fnuz_t>(bf8_fnuz_t x)
     return utils::cast_from_f8<bf8_fnuz_t, half_t, negative_zero_nan>(x);
 #endif
 }
-
+#ifndef CK_CODE_GEN_RTC
 // convert fp32 to fp4 with rounding to nearest even
 inline __host__ __device__ f4_t f4_convert_rne(float x, float scale = 1.0f)
 {
@@ -791,7 +1433,11 @@ inline __host__ __device__ f4x32_t f4_convert_rne(float32_t x, float scale = 1.0
 inline __host__ __device__ f4_t f4_convert_sr(float x, float scale = 1.0f)
 {
     constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
 #if defined(__gfx950__)
     union
     {
@@ -816,16 +1462,27 @@ inline __host__ __device__ f4_t f4_convert_sr(float x, float scale = 1.0f)
 inline __host__ __device__ f4x2_t f4_convert_sr(float2_t x, float scale = 1.0f)
 {
     constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
+#endif
 #if defined(__gfx950__)
     union
     {
         uint32_t bitwise;
         f4x2_t f4x2_array[4];
     } value{0};
+// apply a temporary workaround for gfx950
+#if CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION
+    uint8_t l     = utils::sat_convert_to_type_sr<f4_t>(x[1] / scale, rng);
+    uint8_t h     = utils::sat_convert_to_type_sr<f4_t>(x[0] / scale, rng);
+    value.bitwise = (h << 4) | l;
+#else
     // permute high bits and low bits to match the order of the original vector
     value.bitwise = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(
         value.bitwise, float2_t{x[1], x[0]}, rng, scale, 0);
+#endif // CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION
     return value.f4x2_array[0];
 #else
     union
@@ -844,7 +1501,11 @@ inline __host__ __device__ f4x2_t f4_convert_sr(float2_t x, float scale = 1.0f)
 inline __host__ __device__ f4x32_t f4_convert_sr(float32_t x, float scale = 1.0f)
 {
     constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
+#endif
 #if defined(__gfx950__)
     union
     {
@@ -1097,7 +1758,11 @@ inline __host__ __device__ f6x32_t f6_convert_rne(float32_t x, float scale = 1.0
 inline __host__ __device__ f6_t f6_convert_sr(float x, float scale = 1.0f)
 {
     constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
 #if defined(__gfx950__)
     union
     {
@@ -1137,8 +1802,13 @@ inline __host__ __device__ f6x32_t f6_convert_sr(float32_t x, float scale = 1.0f
         float32_t float_vector;
         float float_array[32];
     } float_values{x};
+#ifndef CK_CODE_GEN_RTC
     uint32_t rng =
         prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), float_values.float_array[0]);
+#else
+    uint32_t rng =
+        prand_generator<float, seed>(reinterpret_cast<size_t>(&x), float_values.float_array[0]);
+#endif
 #if defined(__gfx950__)
     return __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(x, rng, scale);
 #else
@@ -1356,7 +2026,11 @@ inline __host__ __device__ bf6x32_t bf6_convert_rne(float32_t x, float scale = 1
 inline __host__ __device__ bf6_t bf6_convert_sr(float x, float scale = 1.0f)
 {
     constexpr int seed = 1254739;
-    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
 #if defined(__gfx950__)
     union
     {
@@ -1398,8 +2072,13 @@ inline __host__ __device__ bf6x32_t bf6_convert_sr(float32_t x, float scale = 1.
         float32_t float_vector;
         float float_array[32];
     } float_values{x};
+#ifndef CK_CODE_GEN_RTC
     uint32_t rng =
         prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), float_values.float_array[0]);
+#else
+    uint32_t rng =
+        prand_generator<float, seed>(reinterpret_cast<size_t>(&x), float_values.float_array[0]);
+#endif
 #if defined(__gfx950__)
     return __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(x, rng, scale);
 #else
@@ -1531,7 +2210,7 @@ inline __host__ __device__ float32_t type_convert<float32_t, bf6x32_t>(bf6x32_t
     return out.float_vector;
 #endif
 }
-
+#endif
 #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <typename Y, typename X, size_t NumElems>
 inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index a8c95b9c38..821b3a8e84 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/algorithm/space_filling_curve.hpp"
 #include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
+#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
 #include "ck_tile/core/arch/utility.hpp"
@@ -58,6 +59,7 @@
 #include "ck_tile/core/tensor/transpose_tile.hpp"
 #include "ck_tile/core/tensor/update_tile.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
+#include "ck_tile/core/utility/env.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/functional_with_tuple.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
diff --git a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
index 78884f3f9f..b56bda3741 100644
--- a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
+++ b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
@@ -73,10 +73,11 @@ struct TileDistributionEncodingPattern2D<BlockSize,
 
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
-    static constexpr index_t warp_size = get_warp_size();
-    static constexpr index_t num_warps = BlockSize / get_warp_size();
-    static constexpr index_t X1        = VecSize;
-    static constexpr index_t X0        = XPerTile / X1; // # of threads in X dim
+    static constexpr index_t warp_size  = get_warp_size();
+    static constexpr index_t num_warps  = BlockSize / get_warp_size();
+    static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
+    static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
+    static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
 
     // # of rows in Y dim accessed by single wavefront in one iteration
     static constexpr index_t Y1 = warp_size / X0;
@@ -124,10 +125,11 @@ struct TileDistributionEncodingPattern2D<BlockSize,
 {
 
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
-    static constexpr index_t warp_size = get_warp_size();
-    static constexpr index_t num_warps = BlockSize / get_warp_size();
-    static constexpr index_t X1        = VecSize;
-    static constexpr index_t X0        = XPerTile / X1; // # of threads in X dim
+    static constexpr index_t warp_size  = get_warp_size();
+    static constexpr index_t num_warps  = BlockSize / get_warp_size();
+    static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
+    static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
+    static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
 
     static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
     static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
@@ -173,10 +175,11 @@ struct TileDistributionEncodingPattern2D<BlockSize,
 
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
-    static constexpr index_t warp_size = get_warp_size();
-    static constexpr index_t num_warps = BlockSize / get_warp_size();
-    static constexpr index_t X1        = VecSize;
-    static constexpr index_t X0        = XPerTile / X1; // # of threads in X dim
+    static constexpr index_t warp_size  = get_warp_size();
+    static constexpr index_t num_warps  = BlockSize / get_warp_size();
+    static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
+    static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
+    static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
     static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
     static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
     static constexpr index_t Y1 = num_warps;
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 4e0deb1547..5d6d6ce348 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#if !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
+
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/numeric/vector_type.hpp"
@@ -12,6 +14,15 @@
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 
+// This attribute gives a hint to the compiler that a branch is likely to be taken.
+// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
+// have been generated.
+#if __cplusplus >= 202002L
+#define LIKELY(x) (x) [[likely]]
+#else
+#define LIKELY(x) (__builtin_expect(!!(x), 1))
+#endif
+
 namespace ck_tile {
 
 // 128 bit SGPRs to supply buffer resource in buffer instructions
@@ -56,10 +67,36 @@ template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payloa
 // TODO: glc/slc/...
 template <index_t bytes, bool pre_nop = false>
 struct buffer_load;
+
+template <index_t bytes, bool pre_nop = false>
+struct buffer_load_if;
+
+template <index_t bytes>
+struct buffer_store;
+
+template <index_t bytes>
+struct buffer_store_if;
+
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
 // TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
 // (exp_vector_type(xxx))
+
+#define HAS_RAW_BUFFER_BUILTINS                             \
+    __has_builtin(__builtin_amdgcn_raw_buffer_load_b32) &&  \
+        __has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \
+        __has_builtin(__builtin_amdgcn_raw_buffer_store_b32)
+
+#if HAS_RAW_BUFFER_BUILTINS
+CK_TILE_DEVICE __amdgpu_buffer_rsrc_t cast_to_amdgpu_buffer_rsrc_t(int32x4_t res)
+{
+    __amdgpu_buffer_rsrc_t as_rsrc;
+    static_assert(sizeof(res) == sizeof(as_rsrc) && "Size of buffer resource should match");
+    memcpy(&as_rsrc, &res, sizeof(res));
+    return as_rsrc;
+}
+#endif
+
 template <bool pre_nop>
 struct buffer_load<16, pre_nop>
 {
@@ -74,6 +111,11 @@ struct buffer_load<16, pre_nop>
     {
         static_assert(sizeof(T) == 16);
         using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset                 = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b128(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
@@ -85,6 +127,7 @@ struct buffer_load<16, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
@@ -102,6 +145,11 @@ struct buffer_load<8, pre_nop>
     {
         static_assert(sizeof(T) == 8);
         using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset                 = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b64(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
@@ -113,6 +161,7 @@ struct buffer_load<8, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
@@ -130,6 +179,12 @@ struct buffer_load<4, pre_nop>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
+
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset                 = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b32(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_dword %0, %1, %2, 0 offen offset:%3"
@@ -141,6 +196,7 @@ struct buffer_load<4, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
@@ -158,6 +214,12 @@ struct buffer_load<2, pre_nop>
     {
         static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
         using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
+
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset                 = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
@@ -169,6 +231,7 @@ struct buffer_load<2, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
@@ -186,6 +249,11 @@ struct buffer_load<1, pre_nop>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset                 = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
@@ -197,12 +265,31 @@ struct buffer_load<1, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
-template <index_t bytes, bool pre_nop = false>
-struct buffer_load_if;
-
+#if HAS_RAW_BUFFER_BUILTINS
+template <index_t bytes, bool pre_nop>
+struct buffer_load_if
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t s_offset,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag           = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        if LIKELY(1 <= flag)
+        {
+            buffer_load<bytes, pre_nop>{}(
+                value, res, v_offset, s_offset, i_offset, flag, bool_constant<pre_nop>{});
+        }
+    }
+};
+#else
 template <bool pre_nop>
 struct buffer_load_if<16, pre_nop>
 {
@@ -212,12 +299,12 @@ struct buffer_load_if<16, pre_nop>
                                    index_t v_offset,
                                    index_t /*s_offset*/,
                                    index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                    bool_constant<pre_nop> = {})
     {
         static_assert(sizeof(T) == 16);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<16, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
         static_assert(sizeof(mbuf_t) == sizeof(T));
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
@@ -246,12 +333,12 @@ struct buffer_load_if<8, pre_nop>
                                    index_t v_offset,
                                    index_t /*s_offset*/,
                                    index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                    bool_constant<pre_nop> = {})
     {
         static_assert(sizeof(T) == 8);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<8, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "v_cmpx_le_u32 exec, 1, %4\n"
@@ -279,12 +366,12 @@ struct buffer_load_if<4, pre_nop>
                                    index_t v_offset,
                                    index_t /*s_offset*/,
                                    index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                    bool_constant<pre_nop> = {})
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<4, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "v_cmpx_le_u32 exec, 1, %4\n"
@@ -312,12 +399,12 @@ struct buffer_load_if<2, pre_nop>
                                    index_t v_offset,
                                    index_t /*s_offset*/,
                                    index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                    bool_constant<pre_nop> = {})
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<2, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "v_cmpx_le_u32 exec, 1, %4\n"
@@ -345,12 +432,12 @@ struct buffer_load_if<1, pre_nop>
                                    index_t v_offset,
                                    index_t /*s_offset*/,
                                    index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                    bool_constant<pre_nop> = {})
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<1, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "v_cmpx_le_u32 exec, 1, %4\n"
@@ -368,9 +455,9 @@ struct buffer_load_if<1, pre_nop>
                          : "memory");
     }
 };
+#endif
+
 #pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
-template <index_t bytes>
-struct buffer_store;
 
 template <>
 struct buffer_store<16>
@@ -385,10 +472,16 @@ struct buffer_store<16>
     {
         static_assert(sizeof(T) == 16);
         using mbuf_t = fp32x4_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b128(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -405,10 +498,16 @@ struct buffer_store<8>
     {
         static_assert(sizeof(T) == 8);
         using mbuf_t = fp32x2_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b64(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -425,10 +524,16 @@ struct buffer_store<4>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = float;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b32(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -445,10 +550,16 @@ struct buffer_store<2>
     {
         static_assert(sizeof(T) == 2);
         using mbuf_t = short;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b16(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -465,16 +576,38 @@ struct buffer_store<1>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = float;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b8(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
+#if HAS_RAW_BUFFER_BUILTINS
 template <index_t bytes>
-struct buffer_store_if;
-
+struct buffer_store_if
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t s_offset,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        if LIKELY(1 <= flag)
+        {
+            buffer_store<bytes>{}(value, res, v_offset, s_offset, i_offset);
+        }
+    }
+};
+#else
 template <>
 struct buffer_store_if<16>
 {
@@ -488,7 +621,7 @@ struct buffer_store_if<16>
     {
         static_assert(sizeof(T) == 16);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = fp32x4_t;
+        using mbuf_t = fp32x4_t;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -545,7 +678,7 @@ struct buffer_store_if<4>
     {
         static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = float;
+        using mbuf_t = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -573,7 +706,7 @@ struct buffer_store_if<2>
     {
         static_assert(sizeof(T) == 2);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = short;
+        using mbuf_t = short;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -601,7 +734,7 @@ struct buffer_store_if<1>
     {
         static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = float;
+        using mbuf_t = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -615,6 +748,7 @@ struct buffer_store_if<1>
                      : "memory");
     }
 };
+#endif
 
 CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
 {
@@ -1551,7 +1685,7 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
 
     if constexpr(oob_conditional_check)
     {
-        index_t v_offset = flag ? v_offset : src_wave_buffer_resource[2];
+        index_t v_offset = flag ? src_thread_addr_offset : src_wave_buffer_resource[2];
         llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
                                         smem,
                                         sizeof(uint32_t),
@@ -2553,3 +2687,5 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
 }
 
 } // namespace ck_tile
+
+#endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
new file mode 100644
index 0000000000..0b9956cd01
--- /dev/null
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -0,0 +1,2559 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#if CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
+
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/numeric/vector_type.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/container/thread_buffer.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+#include "ck_tile/core/utility/bit_cast.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+
+namespace ck_tile {
+
+// 128 bit SGPRs to supply buffer resource in buffer instructions
+// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
+struct __attribute__((packed)) buffer_resource
+{
+    const void* ptr;
+    uint32_t range;
+    uint32_t config;
+};
+
+CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff)
+{
+    buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
+    int32x4_t r = __builtin_bit_cast(int32x4_t, res);
+    r.x         = __builtin_amdgcn_readfirstlane(r.x);
+    r.y         = __builtin_amdgcn_readfirstlane(r.y);
+    r.z         = __builtin_amdgcn_readfirstlane(r.z);
+    r.w         = __builtin_amdgcn_readfirstlane(r.w);
+    return r;
+}
+
+namespace impl {
+// below type indicate the data type used for buffer load inline asm
+// clang-format off
+template<index_t N, typename T> struct buffer_load_trait;
+
+template<typename T> struct buffer_load_trait<16, T> { using payload_t = fp32x4_t; };
+template<typename T> struct buffer_load_trait<8 , T> { using payload_t = fp32x2_t; };
+template<typename T> struct buffer_load_trait<4 , T> { using payload_t = float; };
+template<typename T> struct buffer_load_trait<2 , T> { using payload_t = float; };
+template<typename T> struct buffer_load_trait<1 , T> { using payload_t = float; };
+
+#if CK_TILE_BUFFER_LOAD_RAW_BF16_WA
+template<> struct buffer_load_trait<16, thread_buffer<bf16_t, 8>> { using payload_t = bf16x8_t; };
+template<> struct buffer_load_trait<8 , thread_buffer<bf16_t, 4>> { using payload_t = bf16x4_t; };
+template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payload_t = bf16x2_t; };
+#endif
+// clang-format on
+} // namespace impl
+
+// TODO: glc/slc/...
+template <index_t bytes, bool pre_nop = false>
+struct buffer_load;
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
+// (exp_vector_type(xxx))
+template <bool pre_nop>
+struct buffer_load<16, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/       = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 16);
+        using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+        else
+            asm volatile("buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load<8, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/       = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 8);
+        using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+        else
+            asm volatile("buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load<4, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/       = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "buffer_load_dword %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+        else
+            asm volatile("buffer_load_dword %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load<2, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/       = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
+        using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+        else
+            asm volatile("buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load<1, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/       = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+        else
+            asm volatile("buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset)
+                         : "memory");
+    }
+};
+
+template <index_t bytes, bool pre_nop = false>
+struct buffer_load_if;
+
+template <bool pre_nop>
+struct buffer_load_if<16, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag           = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 16);
+        auto saved_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t    = typename impl::buffer_load_trait<16, T>::payload_t;
+        static_assert(sizeof(mbuf_t) == sizeof(T));
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+        else
+            asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load_if<8, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag           = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 8);
+        auto saved_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t    = typename impl::buffer_load_trait<8, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+        else
+            asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load_if<4, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag           = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 4);
+        auto saved_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t    = typename impl::buffer_load_trait<4, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+        else
+            asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load_if<2, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag           = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 4);
+        auto saved_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t    = typename impl::buffer_load_trait<2, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+        else
+            asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+    }
+};
+
+template <bool pre_nop>
+struct buffer_load_if<1, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag           = 0,
+                                   bool_constant<pre_nop> = {})
+    {
+        static_assert(sizeof(T) == 4);
+        auto saved_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t    = typename impl::buffer_load_trait<1, T>::payload_t;
+        if constexpr(pre_nop)
+            asm volatile("s_nop 4\n"
+                         "v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+        else
+            asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                         "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
+                         "s_mov_b64 exec %5"
+                         : "+v"(reinterpret_cast<mbuf_t&>(value))
+                         : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
+                         : "memory");
+    }
+};
+#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
+template <index_t bytes>
+struct buffer_store;
+
+template <>
+struct buffer_store<16>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/ = 1)
+    {
+        static_assert(sizeof(T) == 16);
+        using mbuf_t = fp32x4_t;
+        asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store<8>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/ = 1)
+    {
+        static_assert(sizeof(T) == 8);
+        using mbuf_t = fp32x2_t;
+        asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store<4>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/ = 1)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = float;
+        asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store<2>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/ = 1)
+    {
+        static_assert(sizeof(T) == 2);
+        using mbuf_t = short;
+        asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store<1>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag*/ = 1)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = float;
+        asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <index_t bytes>
+struct buffer_store_if;
+
+template <>
+struct buffer_store_if<16>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 16);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = fp32x4_t;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)),
+                       "v"(v_offset),
+                       "s"(res),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store_if<8>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 8);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        // TODO: ugly. rocm-6.0/6.1 seems neet bit_cast to same base type to avoid scratch
+        using mbuf_t = ext_vector_t<typename T::value_type, T::size()>;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)),
+                       "v"(v_offset),
+                       "s"(res),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store_if<4>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 4);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)),
+                       "v"(v_offset),
+                       "s"(res),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store_if<2>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 2);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = short;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)),
+                       "v"(v_offset),
+                       "s"(res),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+template <>
+struct buffer_store_if<1>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 4);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(bit_cast<mbuf_t>(value)),
+                       "v"(v_offset),
+                       "s"(res),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
+CK_TILE_DEVICE void lds_load_fence(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt lgkmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
+template <typename scalar_type, index_t N, bool pre_nop = false>
+struct buffer_atomic_add_if;
+
+template <bool pre_nop>
+struct buffer_atomic_add_if<bf16_t, 2, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        static_assert(sizeof(T) == 4);
+        auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
+        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
+                     "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n"
+                     "s_mov_b64 exec %5"
+                     :
+                     : "v"(v_offset),
+                       "v"(bit_cast<mbuf_t>(value)),
+                       "s"(res.xy),
+                       "n"(i_offset),
+                       "v"(flag),
+                       "s"(save_exec)
+                     : "memory");
+    }
+};
+
+template <typename scalar_type, index_t N, bool pre_nop = false>
+struct buffer_atomic_add;
+
+template <bool pre_nop>
+struct buffer_atomic_add<bf16_t, 2, pre_nop>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t /*s_offset*/,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t /*flag = 1*/)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = float;
+        asm volatile("global_atomic_pk_add_bf16 %0, %1, %2 offset:%3"
+                     :
+                     : "v"(v_offset), "v"(bit_cast<mbuf_t>(value)), "s"(res.xy), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+namespace impl {
+// below type indicate the data type used for buffer load inline asm
+// clang-format off
+template<index_t N, typename T> struct smem_load_trait;
+
+template<typename T> struct smem_load_trait<16, T> { using payload_t = fp32x4_t; };
+template<typename T> struct smem_load_trait<8 , T> { using payload_t = fp32x2_t; };
+template<typename T> struct smem_load_trait<4 , T> { using payload_t = float; };
+template<typename T> struct smem_load_trait<2 , T> { using payload_t = float; };
+template<typename T> struct smem_load_trait<1 , T> { using payload_t = float; };
+
+// clang-format on
+} // namespace impl
+
+// NOTE: smem load/store no need pre_nop to make sure dependency by sw, happy :)
+template <index_t>
+struct smem_load;
+
+template <>
+struct smem_load<16>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 16);
+        using mbuf_t = typename impl::smem_load_trait<16, T>::payload_t;
+        asm volatile("ds_read_b128 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<8>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 8);
+        using mbuf_t = typename impl::smem_load_trait<8, T>::payload_t;
+        asm volatile("ds_read_b64 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<4>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = typename impl::smem_load_trait<4, T>::payload_t;
+        asm volatile("ds_read_b32 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<2>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
+        using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t;
+        asm volatile("ds_read_u16 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+template <>
+struct smem_load<1>
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
+    {
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t;
+        asm volatile("ds_read_u8 %0, %1 offset:%2"
+                     : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
+                     : "v"(v_offset), "n"(i_offset)
+                     : "memory");
+    }
+};
+
+// clang-format off
+namespace impl{
+
+// can't use "+v" since there could be potential extra move(read/write)
+// use "v" can help remove such duplicated moves
+// besides, fake this as "memory" operation to force later valu after this fence
+// TODO: may have scratch (because this is memory?)
+//       need to reduce extra move inside compiler
+template<index_t N>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword(array<float, N>& b)
+{
+    constexpr auto kSize = remove_cvref_t<decltype(b)>::size(); 
+    static_for<0, kSize, 1>{}([&](auto i){
+        asm volatile(" " : : "v"(b.get(number<i>{})) : "memory");
+    });
+}
+#if 1
+// below specialization just merge size() of dwords into single section
+template<>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<2>(array<float, 2>& b)
+{
+    asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})) : "memory");
+}
+
+template<>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<3>(array<float, 3>& b)
+{
+    asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})) : "memory");
+}
+
+template<>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<4>(array<float, 4>& b)
+{
+    asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})) : "memory");
+}
+
+template<>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<8>(array<float, 8>& b)
+{
+    asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})),
+                         "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})) : "memory");
+}
+
+template<>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<16>(array<float, 16>& b)
+{
+    asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})),
+                         "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})),
+                         "v"(b.get(number<8>{})), "v"(b.get(number<9>{})), "v"(b.get(number<10>{})), "v"(b.get(number<11>{})),
+                         "v"(b.get(number<12>{})), "v"(b.get(number<13>{})), "v"(b.get(number<14>{})), "v"(b.get(number<15>{})) : "memory");
+}
+
+template<>
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<32>(array<float, 32>& b)
+{
+    asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})),
+                         "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})),
+                         "v"(b.get(number<8>{})), "v"(b.get(number<9>{})), "v"(b.get(number<10>{})), "v"(b.get(number<11>{})),
+                         "v"(b.get(number<12>{})), "v"(b.get(number<13>{})), "v"(b.get(number<14>{})), "v"(b.get(number<15>{})),
+                         "v"(b.get(number<16>{})), "v"(b.get(number<17>{})), "v"(b.get(number<18>{})), "v"(b.get(number<19>{})),
+                         "v"(b.get(number<20>{})), "v"(b.get(number<21>{})), "v"(b.get(number<22>{})), "v"(b.get(number<23>{})),
+                         "v"(b.get(number<24>{})), "v"(b.get(number<25>{})), "v"(b.get(number<26>{})), "v"(b.get(number<27>{})),
+                         "v"(b.get(number<28>{})), "v"(b.get(number<29>{})), "v"(b.get(number<30>{})), "v"(b.get(number<31>{})) : "memory");
+}
+#endif
+CK_TILE_DEVICE void insert_dummy_dep() {}
+
+template<typename T>
+CK_TILE_DEVICE void insert_dummy_dep(T & buffer)
+{
+    // TODO: indeed we expect T to be multiple of dword. subdword is always buggy
+    using da_type = array<float, (sizeof(T) + 3) / 4>;
+    auto & dummy = reinterpret_cast<da_type&>(buffer);
+    insert_dummy_dep_per_dword(dummy);
+}
+
+template<typename Tx, typename... Ty>
+CK_TILE_DEVICE void insert_dummy_dep(Tx& bx, Ty&... by)
+{
+    insert_dummy_dep(bx);
+    insert_dummy_dep(by...);
+}
+}
+// clang-format on
+template <typename... T>
+CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0, T&... o)
+{
+    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
+    impl::insert_dummy_dep(o...);
+}
+
+CK_TILE_DEVICE void buffer_store_fence(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
+CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
+// buffer load i8
+CK_TILE_DEVICE_EXTERN int8_t
+llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
+                               index_t voffset,
+                               index_t soffset,
+                               index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8.v4i32");
+
+CK_TILE_DEVICE_EXTERN int8x2_t
+llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
+                                 index_t voffset,
+                                 index_t soffset,
+                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8.v4i32");
+
+CK_TILE_DEVICE_EXTERN int8x4_t
+llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
+                                 index_t voffset,
+                                 index_t soffset,
+                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8.v4i32");
+
+// buffer load i16
+CK_TILE_DEVICE_EXTERN int16_t
+llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
+                                index_t voffset,
+                                index_t soffset,
+                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN int16x2_t
+llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN int16x4_t
+llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16.v4i32");
+
+// buffer load i32
+CK_TILE_DEVICE_EXTERN int32_t
+llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
+                                index_t voffset,
+                                index_t soffset,
+                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32.v4i32");
+
+CK_TILE_DEVICE_EXTERN int32x2_t
+llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32.v4i32");
+
+CK_TILE_DEVICE_EXTERN int32x4_t
+llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32.v4i32");
+
+// buffer load fp16
+CK_TILE_DEVICE_EXTERN _Float16
+llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
+                                 index_t voffset,
+                                 index_t soffset,
+                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16.v4i32");
+
+CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2(
+    int32x4_t srsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16.v4i32");
+
+CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4(
+    int32x4_t srsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16.v4i32");
+
+// buffer load fp32
+CK_TILE_DEVICE_EXTERN float
+llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
+                                 index_t voffset,
+                                 index_t soffset,
+                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32.v4i32");
+
+CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2(
+    int32x4_t srsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32.v4i32");
+
+CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4(
+    int32x4_t srsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32.v4i32");
+
+// buffer store i8
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
+                                int32x4_t rsrc,
+                                index_t voffset,
+                                index_t soffset,
+                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8.v4i32");
+
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
+                                  int32x4_t rsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8.v4i32");
+
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
+                                  int32x4_t rsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8.v4i32");
+
+// buffer store i16
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
+                                 int32x4_t rsrc,
+                                 index_t voffset,
+                                 index_t soffset,
+                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2(
+    int16x2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4(
+    int16x4_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16.v4i32");
+
+// buffer store i32
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
+                                 int32x4_t rsrc,
+                                 index_t voffset,
+                                 index_t soffset,
+                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32.v4i32");
+
+// buffer store ui16
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata,
+                                  int32x4_t rsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2(
+    uint16x2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4(
+    uint16x4_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2(
+    int32x2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4(
+    int32x4_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32.v4i32");
+
+// buffer store fp16
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata,
+                                  int32x4_t rsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2(
+    fp16x2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4(
+    fp16x4_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16.v4i32");
+
+// buffer store fp32
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_store_fp32(float vdata,
+                                  int32x4_t rsrc,
+                                  index_t voffset,
+                                  index_t soffset,
+                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2(
+    fp32x2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32.v4i32");
+
+CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4(
+    fp32x4_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32.v4i32");
+
+// buffer atomic-add fp16
+CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+    fp16x2_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32");
+
+// buffer atomic-add i32
+CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
+    int32_t vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32.v4i32");
+
+// buffer atomic-add fp32
+CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
+    float vdata,
+    int32x4_t rsrc,
+    index_t voffset,
+    index_t soffset,
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32");
+
+// buffer atomic-max fp64
+CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64(
+    double vdata,
+    int32x4_t rsrc, // dst_wave_buffer_resource
+    int voffset,    // dst_thread_addr_offset
+    int soffset,    // dst_wave_addr_offset
+    int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32");
+
+// Direct loads from global to LDS.
+CK_TILE_DEVICE_EXTERN void
+llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
+                                __attribute__((address_space(3))) uint32_t* lds_ptr,
+                                index_t size,
+                                index_t voffset,
+                                index_t soffset,
+                                index_t offset,
+                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32");
+
+template <bool pre_nop = false>
+CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
+                                              int32x4_t rsrc,
+                                              index_t voffset,
+                                              index_t /*soffset*/,
+                                              index_t ioffset /*max 0xFFF*/,
+                                              index_t /*flag*/       = 0,
+                                              bool_constant<pre_nop> = {})
+{
+    if constexpr(pre_nop)
+        asm volatile("s_nop 4\n"
+                     "buffer_load_dword %1, %2, 0 offen offset:%3 lds"
+                     : "=r"(smem) /*dummy dependency for smem*/
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
+                     : "memory");
+    else
+        asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
+                     : "=r"(smem) /*dummy dependency for smem*/
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
+                     : "memory");
+}
+
+CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
+{
+    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
+}
+
+// memory coherency bit for buffer store/load instruction
+// check ISA manual for each GFX target
+// e.g. for
+// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
+// page 67~68
+enum struct amd_buffer_coherence_enum
+{
+    coherence_default = 0, // default value
+    glc               = 1,
+    slc               = 2,
+    glc_slc           = 3,
+};
+
+template <index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
+CK_TILE_DEVICE thread_buffer<int8_t, N>
+amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
+                                index_t src_thread_addr_offset,
+                                index_t src_wave_addr_offset)
+{
+    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
+                  "wrong! not implemented");
+
+    using rtn_type = thread_buffer<int8_t, N>;
+
+    if constexpr(N == 1)
+    {
+        return bit_cast<rtn_type>(llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
+                                                                 src_thread_addr_offset,
+                                                                 src_wave_addr_offset,
+                                                                 static_cast<index_t>(coherence)));
+    }
+    else if constexpr(N == 2)
+    {
+
+        int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
+
+        return bit_cast<rtn_type>(tmp);
+    }
+    else if constexpr(N == 4)
+    {
+        int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
+                                                      src_thread_addr_offset,
+                                                      src_wave_addr_offset,
+                                                      static_cast<index_t>(coherence));
+
+        return bit_cast<rtn_type>(tmp);
+    }
+    else if constexpr(N == 8)
+    {
+        int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
+                                                          src_thread_addr_offset,
+                                                          src_wave_addr_offset,
+                                                          static_cast<index_t>(coherence));
+
+        return bit_cast<rtn_type>(tmp);
+    }
+    else if constexpr(N == 16)
+    {
+        int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                          src_thread_addr_offset,
+                                                          src_wave_addr_offset,
+                                                          static_cast<index_t>(coherence));
+        return bit_cast<rtn_type>(tmp);
+    }
+    else if constexpr(N == 32)
+    {
+        int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                           src_thread_addr_offset,
+                                                           src_wave_addr_offset,
+                                                           static_cast<index_t>(coherence));
+        int32x4_t tmp1 =
+            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                              src_thread_addr_offset,
+                                              src_wave_addr_offset + 4 * sizeof(int32_t),
+                                              static_cast<index_t>(coherence));
+        thread_buffer<int32_t, 8> tmp;
+
+        tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
+        tmp.template get_as<int32x4_t>()(number<1>{}) = tmp1;
+
+        return bit_cast<rtn_type>(tmp);
+    }
+    else if constexpr(N == 64)
+    {
+        int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                           src_thread_addr_offset,
+                                                           src_wave_addr_offset,
+                                                           static_cast<index_t>(coherence));
+        int32x4_t tmp1 =
+            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                              src_thread_addr_offset,
+                                              src_wave_addr_offset + 4 * sizeof(int32_t),
+                                              static_cast<index_t>(coherence));
+        int32x4_t tmp2 =
+            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                              src_thread_addr_offset,
+                                              src_wave_addr_offset + 8 * sizeof(int32_t),
+                                              static_cast<index_t>(coherence));
+        int32x4_t tmp3 =
+            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                              src_thread_addr_offset,
+                                              src_wave_addr_offset + 12 * sizeof(int32_t),
+                                              static_cast<index_t>(coherence));
+
+        thread_buffer<int32_t, 16> tmp;
+
+        tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
+        tmp.template get_as<int32x4_t>()(number<1>{}) = tmp1;
+        tmp.template get_as<int32x4_t>()(number<2>{}) = tmp2;
+        tmp.template get_as<int32x4_t>()(number<3>{}) = tmp3;
+
+        return bit_cast<rtn_type>(tmp);
+    }
+}
+
+#ifndef BUFFER_LOAD_USE_INLINEASM
+#define BUFFER_LOAD_USE_INLINEASM 0
+#endif
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
+CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
+                                                        index_t src_thread_addr_offset,
+                                                        index_t src_wave_addr_offset)
+{
+    static_assert(
+        (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, int32_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, pk_int4_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)),
+        "wrong! not implemented");
+
+    using rtn_type = thread_buffer<T, N>;
+
+    if constexpr(std::is_same<T, float>::value) // fp32
+    {
+        if constexpr(N == 1)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 2)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 4)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 8)
+        {
+            thread_buffer<float, 8> tmp;
+
+            tmp.template get_as<fp32x4_t>()(number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
+
+            tmp.template get_as<fp32x4_t>()(number<1>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset + 4 * sizeof(float),
+                                                   static_cast<index_t>(coherence));
+
+            return tmp;
+        }
+        else if constexpr(N == 16)
+        {
+            thread_buffer<float, 16> tmp;
+
+            tmp.template get_as<fp32x4_t>()(number<0>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence));
+
+            tmp.template get_as<fp32x4_t>()(number<1>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset + 4 * sizeof(float),
+                                                   static_cast<index_t>(coherence));
+
+            tmp.template get_as<fp32x4_t>()(number<2>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset + 8 * sizeof(float),
+                                                   static_cast<index_t>(coherence));
+
+            tmp.template get_as<fp32x4_t>()(number<3>{}) =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset + 12 * sizeof(float),
+                                                   static_cast<index_t>(coherence));
+
+            return tmp;
+        }
+    }
+    else if constexpr(std::is_same<T, fp16_t>::value) // fp16
+    {
+        if constexpr(N == 1)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 2)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 4)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 8)
+        {
+            // use fp32 load to mimic fp16 load
+            fp32x4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
+
+            return bit_cast<rtn_type>(tmp);
+        }
+    }
+    else if constexpr(std::is_same<T, bf16_t>::value) // bf16
+    {
+        if constexpr(N == 1)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                src_wave_addr_offset,
+                                                static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 2)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 4)
+        {
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence)));
+        }
+        else if constexpr(N == 8)
+        {
+            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
+                                                              src_thread_addr_offset,
+                                                              src_wave_addr_offset,
+                                                              static_cast<index_t>(coherence));
+
+            return bit_cast<rtn_type>(tmp);
+        }
+    }
+    else // other datatype
+    {
+        auto raw_data = amd_buffer_load_impl_with_bytes<sizeof(T) * N, coherence>(
+            src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
+
+        return bit_cast<rtn_type>(raw_data);
+    }
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer<T, N>& dst,
+                                             int32x4_t src_wave_buffer_resource,
+                                             index_t src_thread_addr_offset,
+                                             index_t src_wave_addr_offset,
+                                             index_t src_linear_addr_offset,
+                                             index_t flag           = 0,
+                                             bool_constant<pre_nop> = {})
+{
+    constexpr index_t bytes = sizeof(T) * N;
+    static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
+                  "wrong! not supported by buffer_load instruction");
+
+    using type = thread_buffer<T, N>;
+    if constexpr(oob_conditional_check)
+    {
+        buffer_load_if<sizeof(type), pre_nop>{}(dst,
+                                                src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                src_wave_addr_offset,
+                                                src_linear_addr_offset,
+                                                flag,
+                                                bool_constant<pre_nop>{});
+    }
+    else
+    {
+        buffer_load<sizeof(type), pre_nop>{}(dst,
+                                             src_wave_buffer_resource,
+                                             src_thread_addr_offset,
+                                             src_wave_addr_offset,
+                                             src_linear_addr_offset,
+                                             flag,
+                                             bool_constant<pre_nop>{});
+    }
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
+                                               int32x4_t src_wave_buffer_resource,
+                                               index_t src_thread_addr_offset,
+                                               index_t src_wave_addr_offset,
+                                               index_t src_immediate_addr_offset = 0,
+                                               bool_constant<pre_nop>            = {})
+{
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
+
+    async_buffer_load_dword_v(smem,
+                              src_wave_buffer_resource,
+                              src_thread_addr_offset,
+                              src_wave_addr_offset,
+                              src_immediate_addr_offset,
+                              0,
+                              bool_constant<pre_nop>{});
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
+                                          int32x4_t src_wave_buffer_resource,
+                                          index_t src_thread_addr_offset,
+                                          index_t src_wave_addr_offset,
+                                          index_t src_immediate_addr_offset    = 0,
+                                          index_t flag                         = 0,
+                                          bool_constant<oob_conditional_check> = {})
+{
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
+
+    if constexpr(oob_conditional_check)
+    {
+        index_t v_offset = flag ? v_offset : src_wave_buffer_resource[2];
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        v_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+    else
+    {
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        src_thread_addr_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+}
+
+template <index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
+CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data,
+                                                     int32x4_t dst_wave_buffer_resource,
+                                                     index_t dst_thread_addr_offset,
+                                                     index_t dst_wave_addr_offset)
+{
+    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
+                  "wrong! not implemented");
+
+    if constexpr(N == 1)
+    {
+        llvm_amdgcn_raw_buffer_store_i8(bit_cast<int8_t>(src_thread_data),
+                                        dst_wave_buffer_resource,
+                                        dst_thread_addr_offset,
+                                        dst_wave_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 2)
+    {
+
+        llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
+                                         dst_wave_buffer_resource,
+                                         dst_thread_addr_offset,
+                                         dst_wave_addr_offset,
+                                         static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 4)
+    {
+        llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
+                                         dst_wave_buffer_resource,
+                                         dst_thread_addr_offset,
+                                         dst_wave_addr_offset,
+                                         static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 8)
+    {
+        llvm_amdgcn_raw_buffer_store_i32x2(bit_cast<int32x2_t>(src_thread_data),
+                                           dst_wave_buffer_resource,
+                                           dst_thread_addr_offset,
+                                           dst_wave_addr_offset,
+                                           static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 16)
+    {
+        llvm_amdgcn_raw_buffer_store_i32x4(bit_cast<int32x4_t>(src_thread_data),
+                                           dst_wave_buffer_resource,
+                                           dst_thread_addr_offset,
+                                           dst_wave_addr_offset,
+                                           static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 32)
+    {
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<0>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset,
+            static_cast<index_t>(coherence));
+
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<1>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 4,
+            static_cast<index_t>(coherence));
+    }
+    else if constexpr(N == 64)
+    {
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<0>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset,
+            static_cast<index_t>(coherence));
+
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<1>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 4,
+            static_cast<index_t>(coherence));
+
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<2>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 8,
+            static_cast<index_t>(coherence));
+
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<3>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 12,
+            static_cast<index_t>(coherence));
+    }
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
+CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_data,
+                                          int32x4_t dst_wave_buffer_resource,
+                                          index_t dst_thread_addr_offset,
+                                          index_t dst_wave_addr_offset)
+{
+    static_assert(
+        (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int32_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, uint16_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+        "wrong! not implemented");
+
+    if constexpr(std::is_same<T, float>::value) // fp32
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32(bit_cast<float>(src_thread_data),
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x2(bit_cast<fp32x2_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 8)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x4(
+                src_thread_data.template get_as<fp32x4_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                static_cast<index_t>(coherence));
+            llvm_amdgcn_raw_buffer_store_fp32x4(
+                src_thread_data.template get_as<fp32x4_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 4 * sizeof(float),
+                static_cast<index_t>(coherence));
+        }
+    }
+    else if constexpr(std::is_same<T, fp16_t>::value) // fp16
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16(bit_cast<_Float16>(src_thread_data),
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16x2(bit_cast<fp16x2_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16x4(bit_cast<fp16x4_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 8)
+        {
+#if 0
+            thread_buffer<fp16_t, 8> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset + 4 * sizeof(fp16_t),
+                                                static_cast<index_t>(coherence));
+#else
+            llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+#endif
+        }
+    }
+    else if constexpr(std::is_same<T, bf16_t>::value) // bf16
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
+                                             dst_wave_buffer_resource,
+                                             dst_thread_addr_offset,
+                                             dst_wave_addr_offset,
+                                             static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_i16x2(bit_cast<int16x2_t>(src_thread_data),
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_store_i16x4(bit_cast<int16x4_t>(src_thread_data),
+                                               dst_wave_buffer_resource,
+                                               dst_thread_addr_offset,
+                                               dst_wave_addr_offset,
+                                               static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 8)
+        {
+            llvm_amdgcn_raw_buffer_store_i16x4(
+                src_thread_data.template get_as<int16x4_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                static_cast<index_t>(coherence));
+
+            llvm_amdgcn_raw_buffer_store_i16x4(
+                src_thread_data.template get_as<int16x4_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 4 * sizeof(bf16_t),
+                static_cast<index_t>(coherence));
+        }
+    }
+    else if constexpr(std::is_same<T, uint16_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_ui16(bit_cast<uint16_t>(src_thread_data),
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_ui16x2(bit_cast<uint16x2_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_store_ui16x4(bit_cast<uint16x4_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                static_cast<index_t>(coherence));
+        }
+        else if constexpr(N == 8)
+        {
+            llvm_amdgcn_raw_buffer_store_ui16x4(
+                src_thread_data.template get_as<uint16x4_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                static_cast<index_t>(coherence));
+
+            llvm_amdgcn_raw_buffer_store_ui16x4(
+                src_thread_data.template get_as<uint16x4_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 4 * sizeof(uint16_t),
+                static_cast<index_t>(coherence));
+        }
+    }
+    else
+    {
+        using r_t = thread_buffer<int8_t, sizeof(T) * N>;
+
+        amd_buffer_store_impl_with_bytes<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
+                                                                   dst_wave_buffer_resource,
+                                                                   dst_thread_addr_offset,
+                                                                   dst_wave_addr_offset);
+    }
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer<T, N>& dst_thread_data,
+                                              int32x4_t dst_wave_buffer_resource,
+                                              index_t dst_thread_addr_offset,
+                                              index_t dst_wave_addr_offset,
+                                              index_t dst_linear_addr_offset,
+                                              index_t is_valid_element = 1)
+{
+    constexpr index_t bytes = sizeof(T) * N;
+    static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
+                  "wrong! not supported by buffer_store instruction");
+
+    using type = thread_buffer<T, N>;
+    if constexpr(oob_conditional_check)
+    {
+        buffer_store_if<sizeof(type)>{}(dst_thread_data,
+                                        dst_wave_buffer_resource,
+                                        dst_thread_addr_offset,
+                                        dst_wave_addr_offset,
+                                        dst_linear_addr_offset,
+                                        is_valid_element);
+    }
+    else
+    {
+        buffer_store<sizeof(type)>{}(dst_thread_data,
+                                     dst_wave_buffer_resource,
+                                     dst_thread_addr_offset,
+                                     dst_wave_addr_offset,
+                                     dst_linear_addr_offset);
+    }
+}
+
+template <typename T, index_t N>
+CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer<T, N>& src_thread_data,
+                                               int32x4_t dst_wave_buffer_resource,
+                                               index_t dst_thread_addr_offset,
+                                               index_t dst_wave_addr_offset)
+{
+    static_assert((std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
+                      (std::is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
+                  "wrong! not implemented");
+
+    if constexpr(std::is_same<T, float>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(bit_cast<float>(src_thread_data),
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(float),
+                0);
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(float),
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<2>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 2 * sizeof(float),
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<3>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 3 * sizeof(float),
+                0);
+        }
+    }
+    else if constexpr(std::is_same<T, fp16_t>::value)
+    {
+        if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(bit_cast<fp16x2_t>(src_thread_data),
+                                                     dst_wave_buffer_resource,
+                                                     dst_thread_addr_offset,
+                                                     dst_wave_addr_offset,
+                                                     0);
+        }
+        else if constexpr(N == 4)
+        {
+            static_for<0, 2, 1>{}([&](auto i) {
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+                    src_thread_data.template get_as<fp16x2_t>()[i],
+                    dst_wave_buffer_resource,
+                    dst_thread_addr_offset,
+                    dst_wave_addr_offset + i * sizeof(fp16x2_t),
+                    0);
+            });
+        }
+        else if constexpr(N == 8)
+        {
+            static_for<0, 4, 1>{}([&](auto i) {
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+                    src_thread_data.template get_as<fp16x2_t>()[i],
+                    dst_wave_buffer_resource,
+                    dst_thread_addr_offset,
+                    dst_wave_addr_offset + i * sizeof(fp16x2_t),
+                    0);
+            });
+        }
+    }
+    else if constexpr(std::is_same<T, int32_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_i32(bit_cast<int32_t>(src_thread_data),
+                                                  dst_wave_buffer_resource,
+                                                  dst_thread_addr_offset,
+                                                  dst_wave_addr_offset,
+                                                  0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(int32_t),
+                0);
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(int32_t),
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<2>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 2 * sizeof(int32_t),
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<3>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 3 * sizeof(int32_t),
+                0);
+        }
+    }
+}
+
+template <typename T, index_t N>
+CK_TILE_DEVICE void amd_buffer_atomic_max_impl(const thread_buffer<T, N> src_thread_data,
+                                               int32x4_t dst_wave_buffer_resource,
+                                               index_t dst_thread_addr_offset,
+                                               index_t dst_wave_addr_offset)
+{
+    static_assert((std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
+                  "wrong! not implemented");
+    if constexpr(std::is_same<T, double>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(bit_cast<double>(src_thread_data),
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(double),
+                0);
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(double),
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<2>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 2 * sizeof(double),
+                0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<3>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 3 * sizeof(double),
+                0);
+        }
+    }
+}
+
+// buffer_load requires:
+//   1) p_src_wave must point to global memory space
+//   2) p_src_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+//   oob_conditional_check : dynamic check if out-of-bound
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE thread_buffer<T, N>
+amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
+                                            index_t src_thread_element_offset,
+                                            bool src_thread_element_valid,
+                                            index_t src_element_space_size)
+{
+    const int32x4_t src_wave_buffer_resource =
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
+
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+
+#if CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
+    uint32_t src_addr_shift = [&]() {
+        if constexpr(oob_conditional_check)
+            return src_thread_element_valid ? 0 : 0x80000000;
+        else
+            return 0;
+    }();
+    return amd_buffer_load_impl<T, N, coherence>(
+        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
+#else
+    thread_buffer<T, N> tmp =
+        amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
+    if constexpr(oob_conditional_check)
+        return src_thread_element_valid ? tmp : thread_buffer<T, N>{numeric<T>::zero()};
+    else
+        return tmp;
+#endif
+}
+
+// buffer_load requires:
+//   1) p_src_wave must point to global memory space
+//   2) p_src_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE thread_buffer<T, N>
+amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
+                                                        index_t src_thread_element_offset,
+                                                        bool src_thread_element_valid,
+                                                        index_t src_element_space_size,
+                                                        T customized_value)
+{
+    const int32x4_t src_wave_buffer_resource =
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
+
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+
+    thread_buffer<T, N> tmp =
+        amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
+
+    if constexpr(oob_conditional_check)
+        return src_thread_element_valid ? tmp : thread_buffer<T, N>{customized_value};
+    else
+        return tmp;
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
+                                        const T* p_src_wave,
+                                        index_t src_thread_element_offset,
+                                        index_t src_linear_element_offset,
+                                        index_t src_element_space_size,
+                                        index_t is_valid_element = 0,
+                                        bool_constant<pre_nop>   = {})
+{
+    const int32x4_t src_wave_buffer_resource =
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
+
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
+
+    amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check, pre_nop>(
+        dst,
+        src_wave_buffer_resource,
+        src_thread_addr_offset,
+        0,
+        src_linear_addr_offset,
+        is_valid_element,
+        bool_constant<pre_nop>{});
+}
+
+// This version support buffer resource as input arg
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
+                                        const int32x4_t src_wave_buffer_resource,
+                                        index_t src_thread_element_offset,
+                                        index_t src_linear_element_offset,
+                                        index_t is_valid_element = 0,
+                                        bool_constant<pre_nop>   = {})
+{
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
+
+    amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check, pre_nop>(
+        dst,
+        src_wave_buffer_resource,
+        src_thread_addr_offset,
+        0,
+        src_linear_addr_offset,
+        is_valid_element,
+        bool_constant<pre_nop>{});
+}
+
+// unfortunately async copy can not make sure invalid data is zero inside LDS
+// ... unless people manually write zero to LDS at the proper address.
+// so not support invalid_element check for now.
+// buffer_load OOB still working.
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
+                                                       const T* p_src_wave,
+                                                       index_t src_thread_element_offset,
+                                                       index_t src_linear_element_offset,
+                                                       index_t src_element_space_size,
+                                                       bool_constant<pre_nop> = {})
+{
+    const int32x4_t src_wave_buffer_resource =
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
+
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
+
+    amd_async_buffer_load_impl<T, N, coherence>(smem,
+                                                src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                0,
+                                                src_linear_addr_offset,
+                                                bool_constant<pre_nop>{});
+}
+
+// This version support buffer resource as input arg
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
+                                                       const int32x4_t src_wave_buffer_resource,
+                                                       index_t src_thread_element_offset,
+                                                       index_t src_linear_element_offset,
+                                                       bool_constant<pre_nop> = {})
+{
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
+
+    amd_async_buffer_load_impl<T, N, coherence>(smem,
+                                                src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                0,
+                                                src_linear_addr_offset,
+                                                bool_constant<pre_nop>{});
+}
+
+// This version support buffer resource as input arg
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = false>
+CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem,
+                                                   const int32x4_t src_wave_buffer_resource,
+                                                   index_t src_thread_element_offset,
+                                                   index_t src_linear_element_offset,
+                                                   bool is_valid_element,
+                                                   bool_constant<oob_conditional_check> = {})
+{
+    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
+    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
+
+    amd_async_buffer_load<T, N, coherence>(smem,
+                                           src_wave_buffer_resource,
+                                           src_thread_addr_offset,
+                                           0,
+                                           src_linear_addr_offset,
+                                           is_valid_element,
+                                           bool_constant<oob_conditional_check>{});
+}
+
+// buffer_store requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE void amd_buffer_store(const thread_buffer<T, N>& src_thread_data,
+                                     T* p_dst_wave,
+                                     const index_t dst_thread_element_offset,
+                                     const bool dst_thread_element_valid,
+                                     const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+#if CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
+    uint32_t dst_addr_shift = [&]() {
+        if constexpr(oob_conditional_check)
+            return dst_thread_element_valid ? 0 : 0x80000000;
+        else
+            return 0;
+    }();
+    amd_buffer_store_impl<T, N, coherence>(
+        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+    if constexpr(oob_conditional_check)
+    {
+        if(dst_thread_element_valid)
+        {
+            amd_buffer_store_impl<T, N, coherence>(
+                src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+        }
+    }
+    else
+    {
+        amd_buffer_store_impl<T, N, coherence>(
+            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+    }
+#endif
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true>
+CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_data,
+                                         T* p_dst_wave,
+                                         const index_t dst_thread_element_offset,
+                                         const index_t dst_linear_element_offset,
+                                         const bool dst_thread_element_valid,
+                                         const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+    index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
+
+    amd_buffer_store_raw_impl<T, N, coherence, oob_conditional_check>(src_thread_data,
+                                                                      dst_wave_buffer_resource,
+                                                                      dst_thread_addr_offset,
+                                                                      0,
+                                                                      dst_linear_addr_offset,
+                                                                      dst_thread_element_valid);
+}
+
+// buffer_atomic_add requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T, index_t N>
+CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer<T, N>& src_thread_data,
+                                          T* p_dst_wave,
+                                          const index_t dst_thread_element_offset,
+                                          const bool dst_thread_element_valid,
+                                          const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
+    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
+
+    amd_buffer_atomic_add_impl<T, N>(
+        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+    if(dst_thread_element_valid)
+    {
+        amd_buffer_atomic_add_impl<T, N>(
+            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+    }
+#endif
+}
+
+template <typename T,
+          index_t N,
+          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
+          bool oob_conditional_check          = true,
+          bool pre_nop                        = false>
+CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer<T, N>& src_thread_data,
+                                              T* p_dst_wave,
+                                              const index_t dst_thread_element_offset,
+                                              const index_t dst_linear_element_offset,
+                                              const bool dst_thread_element_valid,
+                                              const index_t dst_element_space_size,
+                                              bool_constant<pre_nop> = {})
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+    index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
+
+    if constexpr(oob_conditional_check)
+    {
+        buffer_atomic_add_if<T, N, pre_nop>{}(src_thread_data,
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              0,
+                                              dst_linear_addr_offset,
+                                              dst_thread_element_valid);
+    }
+    else
+    {
+        buffer_atomic_add<T, N, pre_nop>{}(src_thread_data,
+                                           dst_wave_buffer_resource,
+                                           dst_thread_addr_offset,
+                                           0,
+                                           dst_linear_addr_offset,
+                                           1);
+    }
+}
+
+// buffer_atomic_max requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T, index_t N>
+CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_data,
+                                          T* p_dst_wave,
+                                          const index_t dst_thread_element_offset,
+                                          const bool dst_thread_element_valid,
+                                          const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
+    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
+
+    amd_buffer_atomic_max_impl<T, N>(
+        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+    if(dst_thread_element_valid)
+    {
+        amd_buffer_atomic_max_impl<T, N>(
+            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+    }
+#endif
+}
+
+template <typename T, index_t NumElemsPerThread>
+CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
+                                                  const index_t global_offset,
+                                                  T* lds_base_ptr,
+                                                  const index_t lds_offset,
+                                                  const bool is_valid,
+                                                  const index_t src_element_space_size)
+{
+    // Direct loads require that each thread reads and writes exactly a single DWORD.
+    constexpr auto dword_bytes      = 4;
+    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
+    static_assert(bytes_per_thread == dword_bytes);
+
+    const uint32_t* global_ptr =
+        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
+    const int32x4_t src_resource =
+        make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
+    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
+
+#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
+    T* lds_ptr = lds_base_ptr + lds_offset;
+    auto const lds_ptr_sgpr =
+        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
+    asm volatile("s_mov_b32 m0, %0; \n\t"
+                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
+                 "v"(global_offset_bytes),
+                 "s"(src_resource)
+                 : "memory");
+#else
+    // LDS pointer must be attributed with the LDS address space.
+    __attribute__((address_space(3))) uint32_t* lds_ptr =
+        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
+            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
+
+    llvm_amdgcn_raw_buffer_load_lds(
+        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
+#endif
+}
+
+} // namespace ck_tile
+
+#endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
diff --git a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
index e6fc08c545..07c6aa0baf 100644
--- a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
+++ b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
@@ -361,7 +361,7 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
     {
         if constexpr(N == 2)
         {
-            atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), bit_cast<bf16x2_t>(x));
+            atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), x.template get_as<bf16x2_t>()[I0]);
         }
         else if constexpr(N == 4)
         {
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index c761fcb8c3..414509e479 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -3,11 +3,10 @@
 
 #pragma once
 
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-    defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
 #define __gfx9__
 #endif
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__)
 #define __gfx94__
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
@@ -16,7 +15,8 @@
 #define __gfx103__
 #endif
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
-    defined(__gfx1103__) || defined(__gfx11_generic__)
+    defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \
+    defined(__gfx1152__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
 #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
@@ -45,7 +45,8 @@
 
 // implementing the "memory address space" attribute
 // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
-#ifdef __HIPCC_
+// WA for https://github.com/ROCm/composable_kernel/issues/1946
+#if 0
 #define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
 #define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
 #define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
@@ -246,3 +247,15 @@
 #else // for GPU code
 #define CK_TILE_USE_OCP_FP8 0
 #endif
+
+#ifndef CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
+#if __clang_major__ == 20
+#define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 1
+#else
+#define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 0
+#endif
+#endif
+
+#ifndef CK_TILE_WA_ISSUE_2028
+#define CK_TILE_WA_ISSUE_2028 1
+#endif
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index fd02177e25..3700d348e7 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -396,11 +396,16 @@ struct tuple_array_impl<T, 1>
 };
 } // namespace impl
 
+template <typename F, index_t... ids>
+CK_TILE_HOST_DEVICE constexpr auto generate_tuple_for(F&& f, sequence<ids...>)
+{
+    return make_tuple(f(number<ids>{})...);
+}
+
 template <typename F, index_t N>
 CK_TILE_HOST_DEVICE constexpr auto generate_tuple(F&& f, number<N>)
 {
-    return unpack([&f](auto&&... is) { return make_tuple(f(is)...); },
-                  typename arithmetic_sequence_gen<0, N, 1>::type{});
+    return generate_tuple_for(f, make_index_sequence<N>{});
 }
 
 template <typename F, index_t N>
diff --git a/include/ck_tile/core/numeric/float8.hpp b/include/ck_tile/core/numeric/float8.hpp
index facc3e45ee..b5da468319 100644
--- a/include/ck_tile/core/numeric/float8.hpp
+++ b/include/ck_tile/core/numeric/float8.hpp
@@ -523,14 +523,14 @@ CK_TILE_HOST_DEVICE DstT run_cast_from_f8(SrcT x)
     int exponent                = (x & 0x7F) >> SrcT_mant;
     if constexpr(is_fnuz)
     {
-        if(x == 0x80)
+        if((x & 0xff) == 0x80)
         {
             return fNaN;
         }
     }
     else
     {
-        if(x == 0x80)
+        if(x == SrcT(0x80))
         {
             return fNeg0;
         }
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index c2a093f1ab..bdcfbdd920 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -5,7 +5,11 @@
 
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/arch/arch.hpp"
+#if __clang_major__ == 20
+#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
+#else
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
+#endif
 #include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
 #include "ck_tile/core/container/array.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 336793c5b1..32de227b52 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -411,18 +411,21 @@ struct null_tensor_view
 };
 
 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
           typename DataType,
           typename... Ts>
 CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* p,
                                                     const tensor_descriptor<Ts...>& desc)
 {
-    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());
 
     return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
 }
 
 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
           memory_operation_enum DstInMemOp      = memory_operation_enum::set,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
           typename DataType,
           typename... Lengths,
           typename... Strides,
@@ -441,12 +444,14 @@ make_naive_tensor_view(DataType* p,
                                              number<GuaranteedLastDimensionVectorLength>{},
                                              number<GuaranteedLastDimensionVectorStride>{});
 
-    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());
 
     return tensor_view<decltype(buffer_view), decltype(desc), DstInMemOp>{buffer_view, desc};
 }
 
 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
           typename DataType,
           typename... Lengths,
           index_t GuaranteedLastDimensionVectorLength = -1>
@@ -458,7 +463,8 @@ make_naive_tensor_view_packed(DataType* p,
     auto desc =
         make_naive_tensor_descriptor_packed(lengths, number<GuaranteedLastDimensionVectorLength>{});
 
-    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());
 
     return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
 }
diff --git a/include/ck_tile/core/tensor/transpose_tile.hpp b/include/ck_tile/core/tensor/transpose_tile.hpp
index f34efe5c2f..5b65b79c1a 100644
--- a/include/ck_tile/core/tensor/transpose_tile.hpp
+++ b/include/ck_tile/core/tensor/transpose_tile.hpp
@@ -83,9 +83,6 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
     constexpr index_t num_vec_in  = vec_length_out;
     constexpr index_t num_vec_out = vec_length_in;
 
-    using InVec  = array<DataType, vec_length_in>;
-    using OutVec = array<DataType, vec_length_out>;
-
     // SFC
     constexpr auto scalars_per_access_arr = generate_array(
         [&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; },
@@ -101,51 +98,84 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
 
     static_assert(num_access > 0, "wrong! num_access should be larger than 0");
 
-    // in/out vectors to be transposed
-    thread_buffer<InVec, num_vec_in> in_vectors;
-    thread_buffer<OutVec, num_vec_out> out_vectors;
+    if constexpr(num_vec_in == 1 || num_vec_out == 1)
+    {
+        // loop over SFC
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            // data index [y0, y1, ...] in the order of input tensor
+            constexpr auto idx_y = SFC_Y::get_index(iAccess);
 
-    // loop over SFC and do transpose
-    static_for<0, num_access, 1>{}([&](auto iAccess) {
-        // data index [y0, y1, ...] in the order of input tensor
-        constexpr auto idx_y_start = SFC_Y::get_index(iAccess);
+            constexpr index_t in_offset  = y_in_desc.calculate_offset(idx_y);
+            constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y);
 
-        // get input vectors
-        static_for<0, num_vec_in, 1>{}([&](auto i) {
-            constexpr auto idx_y_in = generate_tuple(
-                [&](auto ii) {
-                    return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
-                },
-                number<NDimY>{});
-
-            constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
-            static_assert(in_offset % vec_length_in == 0);
-
-            in_vectors(i).template get_as<InVec>()(I0) =
-                in_tensor.get_thread_buffer()
-                    .template get_as<InVec>()[number<in_offset / vec_length_in>{}];
+            if constexpr(vec_length_in == 1)
+            {
+                out_tensor.get_thread_buffer()[number<out_offset>{}] =
+                    in_tensor.get_thread_buffer()[number<in_offset>{}];
+            }
+            else
+            {
+                using Vec = array<DataType, vec_length_in>;
+                out_tensor.get_thread_buffer().template get_as<Vec>(
+                    number<out_offset / vec_length_in>{}) =
+                    in_tensor.get_thread_buffer().template get_as<Vec>(
+                        number<in_offset / vec_length_in>{});
+            }
         });
+    }
+    else
+    {
+        using InVec  = array<DataType, vec_length_in>;
+        using OutVec = array<DataType, vec_length_out>;
 
-        // transpose
-        transpose_vectors<DataType, num_vec_in, num_vec_out>{}(in_vectors, out_vectors);
+        // in/out vectors to be transposed
+        thread_buffer<InVec, num_vec_in> in_vectors;
+        thread_buffer<OutVec, num_vec_out> out_vectors;
 
-        // set output vectors
-        static_for<0, num_vec_out, 1>{}([&](auto i) {
-            constexpr auto idx_y_out_tmp = generate_array(
-                [&](auto ii) { return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii]; },
-                number<NDimY>{});
+        // loop over SFC and do transpose
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            // data index [y0, y1, ...] in the order of input tensor
+            constexpr auto idx_y_start = SFC_Y::get_index(iAccess);
 
-            constexpr auto idx_y_out =
-                container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
+            // get input vectors
+            static_for<0, num_vec_in, 1>{}([&](auto i) {
+                constexpr auto idx_y_in = generate_tuple(
+                    [&](auto ii) {
+                        return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
+                    },
+                    number<NDimY>{});
 
-            constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
-            static_assert(out_offset % vec_length_out == 0);
+                constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
+                static_assert(in_offset % vec_length_in == 0);
 
-            out_tensor.get_thread_buffer().template set_as<OutVec>(
-                number<out_offset / vec_length_out>{},
-                out_vectors[i].template get_as<OutVec>()[I0]);
+                in_vectors(i).template get_as<InVec>()(I0) =
+                    in_tensor.get_thread_buffer()
+                        .template get_as<InVec>()[number<in_offset / vec_length_in>{}];
+            });
+
+            // transpose
+            transpose_vectors<DataType, num_vec_in, num_vec_out>{}(in_vectors, out_vectors);
+
+            // set output vectors
+            static_for<0, num_vec_out, 1>{}([&](auto i) {
+                constexpr auto idx_y_out_tmp = generate_array(
+                    [&](auto ii) {
+                        return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii];
+                    },
+                    number<NDimY>{});
+
+                constexpr auto idx_y_out =
+                    container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
+
+                constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
+                static_assert(out_offset % vec_length_out == 0);
+
+                out_tensor.get_thread_buffer().template set_as<OutVec>(
+                    number<out_offset / vec_length_out>{},
+                    out_vectors[i].template get_as<OutVec>()[I0]);
+            });
         });
-    });
+    }
 }
 
 } // namespace detail
diff --git a/include/ck_tile/core/utility/env.hpp b/include/ck_tile/core/utility/env.hpp
new file mode 100644
index 0000000000..9b148b3e0b
--- /dev/null
+++ b/include/ck_tile/core/utility/env.hpp
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+namespace ck_tile {
+
+template <typename... Args>
+void CK_TILE_ERROR(Args&&... args) noexcept
+{
+    std::ostringstream oss;
+    (oss << ... << args);
+    std::cerr << "[ERROR] " << oss.str() << std::endl;
+}
+
+namespace internal {
+
+template <size_t N>
+bool is_any_of(const char* const (&names)[N], const std::string& str)
+{
+    return std::any_of(std::begin(names), std::end(names), [&](const char* inner_str) {
+        return str == inner_str;
+    });
+};
+
+template <typename T>
+struct ParseEnvVal
+{
+};
+template <>
+struct ParseEnvVal<bool>
+{
+    static bool parse_env_var_value(const char* vp)
+    {
+        std::string value_env_str{vp};
+
+        for(auto& c : value_env_str)
+        {
+            if(std::isalpha(c) != 0)
+            {
+                c = std::tolower(static_cast<unsigned char>(c));
+            }
+        }
+
+        if(is_any_of(enabled_names, value_env_str))
+        {
+            return true;
+        }
+        else if(is_any_of(disabled_names, value_env_str))
+        {
+            return false;
+        }
+        else
+        {
+            throw std::runtime_error("Invalid value for env variable");
+        }
+
+        return false;
+    }
+
+    private:
+    static constexpr const char* enabled_names[]  = {"enable", "enabled", "1", "yes", "on", "true"};
+    static constexpr const char* disabled_names[] = {
+        "disable", "disabled", "0", "no", "off", "false"};
+};
+
+// Supports hexadecimals (with leading "0x"), octals (if prefix is "0") and decimals (default).
+// Returns 0 if environment variable is in wrong format (strtoull fails to parse the string).
+template <>
+struct ParseEnvVal<uint64_t>
+{
+    static uint64_t parse_env_var_value(const char* vp) { return std::strtoull(vp, nullptr, 0); }
+};
+
+template <>
+struct ParseEnvVal<std::string>
+{
+    static std::string parse_env_var_value(const char* vp) { return std::string{vp}; }
+};
+
+template <typename T>
+struct EnvVar
+{
+    private:
+    T value{};
+    bool is_unset = true;
+
+    public:
+    const T& GetValue() const { return value; }
+
+    bool IsUnset() const { return is_unset; }
+
+    void Unset() { is_unset = true; }
+
+    void UpdateValue(const T& val)
+    {
+        is_unset = false;
+        value    = val;
+    }
+
+    explicit EnvVar(const char* const name, const T& def_val)
+    {
+        // NOLINTNEXTLINE (concurrency-mt-unsafe)
+        const char* vp = std::getenv(name);
+        if(vp != nullptr) // a value was provided
+        {
+            is_unset = false;
+            value    = ParseEnvVal<T>::parse_env_var_value(vp);
+        }
+        else // no value provided, use default value
+        {
+            value = def_val;
+        }
+    }
+};
+} // end namespace internal
+
+// Static inside function hides the variable and provides
+// thread-safety/locking
+// Used in global namespace
+#define CK_TILE_DECLARE_ENV_VAR(name, type, default_val)                            \
+    namespace ck_tile::env {                                                        \
+    struct name                                                                     \
+    {                                                                               \
+        static_assert(std::is_same_v<name, ::ck_tile::env::name>,                   \
+                      "CK_TILE_DECLARE_ENV* must be used in the global namespace"); \
+        using value_type = type;                                                    \
+        static ck_tile::internal::EnvVar<type>& Ref()                               \
+        {                                                                           \
+            static ck_tile::internal::EnvVar<type> var{#name, default_val};         \
+            return var;                                                             \
+        }                                                                           \
+    };                                                                              \
+    }
+
+#define CK_TILE_DECLARE_ENV_VAR_BOOL(name) CK_TILE_DECLARE_ENV_VAR(name, bool, false)
+
+#define CK_TILE_DECLARE_ENV_VAR_UINT64(name) CK_TILE_DECLARE_ENV_VAR(name, uint64_t, 0)
+
+#define CK_TILE_DECLARE_ENV_VAR_STR(name) CK_TILE_DECLARE_ENV_VAR(name, std::string, "")
+
+#define CK_TILE_ENV(name) \
+    ck_tile::env::name {}
+
+template <class EnvVar>
+inline const std::string& EnvGetString(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, std::string>);
+    return EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline bool EnvIsEnabled(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
+    return !EnvVar::Ref().IsUnset() && EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline bool EnvIsDisabled(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
+    return !EnvVar::Ref().IsUnset() && !EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline uint64_t EnvValue(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, uint64_t>);
+    return EnvVar::Ref().GetValue();
+}
+
+template <class EnvVar>
+inline bool EnvIsUnset(EnvVar)
+{
+    return EnvVar::Ref().IsUnset();
+}
+
+template <class EnvVar>
+void EnvUnset(EnvVar)
+{
+    EnvVar::Ref().Unset();
+}
+
+/// Updates the cached value of an environment variable
+template <typename EnvVar, typename ValueType>
+void UpdateEnvVar(EnvVar, const ValueType& val)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, ValueType>);
+    EnvVar::Ref().UpdateValue(val);
+}
+
+template <typename EnvVar>
+void UpdateEnvVar(EnvVar, const std::string_view& val)
+{
+    EnvVar::Ref().UpdateValue(
+        ck_tile::internal::ParseEnvVal<typename EnvVar::value_type>::parse_env_var_value(
+            val.data()));
+}
+
+} // namespace ck_tile
+
+// environment variable to enable logging:
+// export CK_TILE_LOGGING=ON or CK_TILE_LOGGING=1 or CK_TILE_LOGGING=ENABLED
+CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING)
diff --git a/include/ck_tile/core/utility/functional.hpp b/include/ck_tile/core/utility/functional.hpp
index 2cdce94063..fd0252d3ca 100644
--- a/include/ck_tile/core/utility/functional.hpp
+++ b/include/ck_tile/core/utility/functional.hpp
@@ -58,6 +58,30 @@ struct static_for
     }
 };
 
+namespace detail {
+
+template <typename T, T... Is>
+struct applier
+{
+    template <typename F>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f) const
+    {
+        // tweak -fbracket-depth if compilation fails. Clang default limit is 256
+        (f(number<Is>{}), ...);
+    }
+};
+
+template <int32_t Size> // == sizeof...(Is)
+using make_applier = __make_integer_seq<applier, index_t, Size>;
+
+} // namespace detail
+
+template <index_t N>
+struct static_for<0, N, 1> : detail::make_applier<N>
+{
+    using detail::make_applier<N>::operator();
+};
+
 struct identity
 {
     template <typename T>
diff --git a/include/ck_tile/core/utility/magic_div.hpp b/include/ck_tile/core/utility/magic_div.hpp
index fd9c733c52..1715983c09 100644
--- a/include/ck_tile/core/utility/magic_div.hpp
+++ b/include/ck_tile/core/utility/magic_div.hpp
@@ -38,7 +38,7 @@ struct magic_division32_bit_range
             shift_u32++;
         };
 
-        uint64_t tmp_u64        = ((1UL << shift_u32) - divisor) << 32;
+        uint64_t tmp_u64        = static_cast<uint64_t>((1UL << shift_u32) - divisor) << 32;
         uint32_t multiplier_u32 = tmp_u64 / divisor + 1;
 
         return make_tuple(multiplier_u32, shift_u32);
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index 006026470b..3f64eb28cd 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -280,7 +280,7 @@ struct FillMonotonicSeq
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::generate(first, last, [=, n = init_value_]() mutable {
+        std::generate(first, last, [=, *this, n = init_value_]() mutable {
             auto tmp = n;
             if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
             {
@@ -315,7 +315,7 @@ struct FillStepRange
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::generate(first, last, [=, n = start_value_]() mutable {
+        std::generate(first, last, [=, *this, n = start_value_]() mutable {
             auto tmp = n;
             n += step_;
             if constexpr(IsAscending)
@@ -364,6 +364,49 @@ struct FillConstant
     }
 };
 
+//----------------------------------------------------------------------------------------------
+/// @brief      Transforms given input to fit 2:4 structured sparsity pattern so
+///             every subgroup of 4 elements contain at most 2 non-zero elements
+template <typename T>
+struct AdjustToStructuredSparsity
+{
+    size_t start{0};
+    // masks represent all valid 2:4 structured sparsity permutations
+    // clang-format off
+    static constexpr int32_t masks[] = {0, 0, 1, 1,
+                                        0, 1, 0, 1,
+                                        0, 1, 1, 0,
+                                        1, 0, 0, 1,
+                                        1, 0, 1, 0,
+                                        1, 1, 0, 0,
+                                        0, 0, 0, 1,
+                                        0, 0, 1, 0,
+                                        0, 1, 0, 0,
+                                        1, 0, 0, 0};
+    // clang-format on
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::transform(first, last, first, [=, *this, index = start](T val) mutable {
+            auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))];
+            index += 1;
+
+            return type_convert<T>(tmp);
+        });
+    }
+
+    template <typename ForwardRange>
+    auto operator()(ForwardRange&& range) const
+        -> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
+            std::begin(std::forward<ForwardRange>(range)),
+            std::end(std::forward<ForwardRange>(range))))>
+    {
+        (*this)(std::begin(std::forward<ForwardRange>(range)),
+                std::end(std::forward<ForwardRange>(range)));
+    }
+};
+
 template <typename T, bool UseCos = true, bool UseAbs = false>
 struct FillTrigValue
 {
diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
index 5c7bf12bfc..d159787387 100644
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -41,6 +41,16 @@ make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
     };
 }
 
+template <typename... Callables>
+CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... callables)
+{
+    // abort the sequence in case of intermediate error
+    if(!((static_cast<void>(callables(sc)), hipPeekAtLastError() == hipSuccess) && ...))
+    {
+        HIP_CHECK_ERROR(hipGetLastError());
+    }
+}
+
 // clang-format off
 /*
  * launch_kernel()
@@ -69,38 +79,39 @@ make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
  **/
 // clang-format on
 template <typename... Callables>
-CK_TILE_HOST float launch_kernel(const stream_config& s, Callables... callables)
+CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
 {
-    // clang-format off
-    if(!s.time_kernel_) {
-        (callables(s),...); HIP_CHECK_ERROR(hipGetLastError());
+    if(!s.time_kernel_)
+    {
+        launch_and_check(s, std::forward<Callables>(callables)...);
         return 0;
     }
-    if(s.is_gpu_timer_) {
-        gpu_timer timer {};
 
+    auto time_launches = [&](auto timer) {
         // warmup
-        for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError());
+        for(int i = 0; i < s.cold_niters_; i++)
+        {
+            launch_and_check(s, std::forward<Callables>(callables)...);
+        }
 
         timer.start(s.stream_id_);
-        for(int i = 0; i < s.nrepeat_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError());
+        for(int i = 0; i < s.nrepeat_; i++)
+        {
+            launch_and_check(s, std::forward<Callables>(callables)...);
+        }
         timer.stop(s.stream_id_);
 
         return timer.duration() / s.nrepeat_;
+    };
+
+    if(s.is_gpu_timer_)
+    {
+        return time_launches(gpu_timer{});
     }
-    else {
-        cpu_timer timer {};
-
-        // warmup
-        for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError());
-
-        timer.start(s.stream_id_);
-        for(int i = 0; i < s.nrepeat_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError());
-        timer.stop(s.stream_id_);
-
-        return timer.duration() / s.nrepeat_;
+    else
+    {
+        return time_launches(cpu_timer{});
     }
-    // clang-format on
 }
 
 } // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
index 475d7014dd..070168b51d 100644
--- a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -35,11 +35,13 @@ template <typename XDataType,
           typename ComputeDataType,
           typename YDataType,
           typename InvRmsDataType,
+          typename UnquantYDataType,
           typename Epilogue = reference_rmsnorm2d_default_epilogue>
 void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
                              const HostTensor<GammaDataType>& gamma_n,
                              HostTensor<YDataType>& y_m_n,
                              HostTensor<InvRmsDataType>& invRms_m,
+                             HostTensor<UnquantYDataType>& unquant_y_m_n,
                              ComputeDataType epsilon,
                              Epilogue epilogue_functor = {})
 {
@@ -69,7 +71,14 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
             acc(m, n)             = x * divisor * gamma;
         }
 
-        epilogue_functor(m, y_m_n, acc);
+        if constexpr(!std::is_same_v<UnquantYDataType, ck_tile::null_type>)
+        {
+            epilogue_functor(m, unquant_y_m_n, y_m_n, acc);
+        }
+        else
+        {
+            epilogue_functor(m, y_m_n, acc);
+        }
     };
 
     make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
diff --git a/include/ck_tile/ops/common/utils.hpp b/include/ck_tile/ops/common/utils.hpp
old mode 100644
new mode 100755
index 8592f93e0f..b422a0a896
--- a/include/ck_tile/ops/common/utils.hpp
+++ b/include/ck_tile/ops/common/utils.hpp
@@ -18,6 +18,7 @@ template <> struct typeToStr<bf16_t> { static constexpr const char * name = "bf1
 template <> struct typeToStr<fp8_t> { static constexpr const char * name = "fp8"; };
 template <> struct typeToStr<bf8_t> { static constexpr const char * name = "bf8"; };
 template <> struct typeToStr<int8_t> { static constexpr const char * name = "int8"; };
+template <> struct typeToStr<pk_int4_t> { static constexpr const char * name = "pk_int4"; };
 // clang-format on
 
 template <typename ADataType_, typename BDataType_>
diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp
index 9d2ed407c9..6cc0fa8540 100644
--- a/include/ck_tile/ops/epilogue.hpp
+++ b/include/ck_tile/ops/epilogue.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
+#include "ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp"
 #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
 #include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
index 155dbad6e3..225997439e 100644
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -9,7 +9,9 @@
 
 namespace ck_tile {
 
-template <typename AccDataType_,
+template <typename ADataType_,
+          typename BDataType_,
+          typename AccDataType_,
           typename ODataType_,
           typename CLayout_,
           index_t kBlockSize_,
@@ -23,6 +25,8 @@ template <typename AccDataType_,
           bool isCTransposed_>
 struct CShuffleEpilogueProblem
 {
+    using ADataType                        = remove_cvref_t<ADataType_>;
+    using BDataType                        = remove_cvref_t<BDataType_>;
     using AccDataType                      = remove_cvref_t<AccDataType_>;
     using ODataType                        = remove_cvref_t<ODataType_>;
     using CLayout                          = remove_cvref_t<CLayout_>;
@@ -40,9 +44,14 @@ struct CShuffleEpilogueProblem
 template <typename Problem_, typename Policy_ = void>
 struct CShuffleEpilogue
 {
-    using Problem                           = remove_cvref_t<Problem_>;
-    using AccDataType                       = remove_cvref_t<typename Problem::AccDataType>;
-    using ODataType                         = remove_cvref_t<typename Problem::ODataType>;
+    using Problem     = remove_cvref_t<Problem_>;
+    using ADataType   = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType   = remove_cvref_t<typename Problem::BDataType>;
+    using AccDataType = remove_cvref_t<typename Problem::AccDataType>;
+    using ODataType   = remove_cvref_t<typename Problem::ODataType>;
+    // Used for weight-only quantization kernel, B would be dequantized to the same data type as A
+    using BTypeToUse =
+        std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
     using CLayout                           = remove_cvref_t<typename Problem::CLayout>;
     static constexpr index_t kBlockSize     = Problem::kBlockSize;
     static constexpr index_t kMPerBlock     = Problem::kMPerBlock;
@@ -56,8 +65,8 @@ struct CShuffleEpilogue
     static constexpr index_t kMPerIteration = kMPerXdl * kMWave;
     static constexpr index_t kNPerIteration = kNPerXdl * kNWave;
 
-    using WG = WarpGemmMfmaDispatcher<ODataType,
-                                      ODataType,
+    using WG = WarpGemmMfmaDispatcher<ADataType,
+                                      BTypeToUse,
                                       AccDataType,
                                       kMPerXdl,
                                       kNPerXdl,
@@ -77,7 +86,6 @@ struct CShuffleEpilogue
      *
      * @return The vector store size for C tensor.
      */
-    template <typename ODataType>
     CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeC()
     {
         constexpr index_t MaxVectorStoreSize = 16;
@@ -143,7 +151,7 @@ struct CShuffleEpilogue
             TileDistributionEncodingPattern2D<kBlockSize,
                                               kMPerIteration,
                                               kNPerIteration,
-                                              GetVectorSizeC<ODataType>(),
+                                              GetVectorSizeC(),
                                               tile_distribution_pattern::thread_raked>;
         constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution();
 
diff --git a/include/ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp
new file mode 100644
index 0000000000..6c5a2ac149
--- /dev/null
+++ b/include/ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "default_2d_epilogue.hpp"
+#include "dynamic_quant_epilogue.hpp"
+
+namespace ck_tile {
+
+// User can reuse DynamicQuantEpilogueTraits with this epilogue
+template <bool kPadM_,
+          bool kPadN_,
+          bool UseSmoothInputScale_,
+          bool UseRawStore_ = true,
+          bool UseMax3_     = false>
+using Default2DAndDynamicQuantEpilogueTraits =
+    DynamicQuantEpilogueTraits<kPadM_, kPadN_, UseSmoothInputScale_, UseRawStore_, UseMax3_>;
+
+// This epilogue just store out a M*N matrix, row major
+template <typename AccDataType_,
+          typename SmoothScaleDataType_,
+          typename YScaleDataType_,
+          typename ODataType_,
+          typename UnquantYDataType_,
+          typename BlockShape_,
+          typename Traits_>
+struct Default2DAndDynamicQuantEpilogueProblem
+{
+    using AccDataType         = remove_cvref_t<AccDataType_>;
+    using SmoothScaleDataType = remove_cvref_t<SmoothScaleDataType_>;
+    using YScaleDataType      = remove_cvref_t<YScaleDataType_>;
+    using ODataType           = remove_cvref_t<ODataType_>;
+    using UnquantYDataType    = remove_cvref_t<UnquantYDataType_>;
+    using BlockShape          = remove_cvref_t<BlockShape_>; // can consum generic 2d shape
+    using Traits              = remove_cvref_t<Traits_>;
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct Default2DAndDynamicQuantEpilogue
+{
+    using Problem          = remove_cvref_t<Problem_>;
+    using AccDataType      = remove_cvref_t<typename Problem::AccDataType>;
+    using UnquantYDataType = remove_cvref_t<typename Problem::UnquantYDataType>;
+
+    static constexpr bool kPadM       = Problem::Traits::kPadM;
+    static constexpr bool kPadN       = Problem::Traits::kPadN;
+    static constexpr bool UseRawStore = Problem::Traits::UseRawStore;
+
+    using Default2DProblem =
+        Default2DEpilogueProblem<AccDataType, UnquantYDataType, kPadM, kPadN, UseRawStore>;
+    using Default2D    = Default2DEpilogue<Default2DProblem>;
+    using DynamicQuant = DynamicQuantEpilogue<Problem>;
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(Default2D::GetSmemSize(), DynamicQuant::GetSmemSize());
+    }
+
+    template <typename ODramWindowTmpD,
+              typename ODramWindowTmpQ,
+              typename SmoothScaleWindow,
+              typename YScaleWindow,
+              typename OAccTile>
+    CK_TILE_DEVICE auto operator()(ODramWindowTmpD& o_direct_dram_window_tmp,
+                                   ODramWindowTmpQ& o_quant_dram_window_tmp,
+                                   const SmoothScaleWindow& sm_scale_window_,
+                                   YScaleWindow& y_scale_window,
+                                   const OAccTile& o_acc_tile,
+                                   void* smem)
+    {
+        Default2D{}(o_direct_dram_window_tmp, o_acc_tile, smem);
+        DynamicQuant{}(o_quant_dram_window_tmp, sm_scale_window_, y_scale_window, o_acc_tile, smem);
+    }
+
+    template <typename ODramWindowTmpD,
+              typename ODramWindowTmpQ,
+              typename YScaleWindow,
+              typename OAccTile>
+    CK_TILE_DEVICE auto operator()(ODramWindowTmpD& o_direct_dram_window_tmp,
+                                   ODramWindowTmpQ& o_quant_dram_window_tmp,
+                                   YScaleWindow& y_scale_window,
+                                   const OAccTile& o_acc_tile,
+                                   void* smem)
+    {
+        Default2D{}(o_direct_dram_window_tmp, o_acc_tile, smem);
+        DynamicQuant{}(o_quant_dram_window_tmp, y_scale_window, o_acc_tile, smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
index 6e290fe6d7..1d6a99eb4b 100644
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -25,7 +25,9 @@ struct Default2DEpilogueProblem
     static constexpr bool UseRawStore = UseRawStore_;
 };
 
-template <typename AccDataType_,
+template <typename ADataType_,
+          typename BDataType_,
+          typename AccDataType_,
           typename ODataType_,
           typename CLayout_,
           bool kPadM_,
@@ -38,6 +40,8 @@ template <typename AccDataType_,
 struct DefaultGemm2DEpilogueProblem
     : public Default2DEpilogueProblem<AccDataType_, ODataType_, kPadM_, kPadN_, UseRawStore_>
 {
+    using ADataType                        = remove_cvref_t<ADataType_>;
+    using BDataType                        = remove_cvref_t<BDataType_>;
     using CLayout                          = remove_cvref_t<CLayout_>;
     static constexpr index_t kMPerXdl      = kMPerXdl_;
     static constexpr index_t kNPerXdl      = kNPerXdl_;
@@ -96,17 +100,22 @@ struct Default2DEpilogue
 template <typename Problem_, typename Policy_ = void>
 struct DefaultGemm2DEpilogue : public Default2DEpilogue<Problem_, Policy_>
 {
-    using Problem                          = remove_cvref_t<Problem_>;
-    using AccDataType                      = remove_cvref_t<typename Problem::AccDataType>;
-    using ODataType                        = remove_cvref_t<typename Problem::ODataType>;
+    using Problem     = remove_cvref_t<Problem_>;
+    using ADataType   = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType   = remove_cvref_t<typename Problem::BDataType>;
+    using AccDataType = remove_cvref_t<typename Problem::AccDataType>;
+    using ODataType   = remove_cvref_t<typename Problem::ODataType>;
+    // Used for weight-only quantization kernel, B would be dequantized to the same data type as A
+    using BTypeToUse =
+        std::conditional_t<std::is_same_v<BDataType, pk_int4_t>, ADataType, BDataType>;
     using CLayout                          = remove_cvref_t<typename Problem::CLayout>;
     static constexpr index_t kMPerXdl      = Problem::kMPerXdl;
     static constexpr index_t kNPerXdl      = Problem::kNPerXdl;
     static constexpr index_t kKPerXdl      = Problem::kKPerXdl;
     static constexpr index_t isCTransposed = Problem::isCTransposed;
 
-    using WG = WarpGemmMfmaDispatcher<ODataType,
-                                      ODataType,
+    using WG = WarpGemmMfmaDispatcher<ADataType,
+                                      BTypeToUse,
                                       AccDataType,
                                       kMPerXdl,
                                       kNPerXdl,
diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp
index 82f6d48eda..1714789e63 100644
--- a/include/ck_tile/ops/flatmm.hpp
+++ b/include/ck_tile/ops/flatmm.hpp
@@ -3,10 +3,16 @@
 
 #pragma once
 
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp"
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
+#include "ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
+#include "ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp
new file mode 100644
index 0000000000..935eb2c028
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp"
+
+namespace ck_tile {
+
+// A is block window on shared memory
+// B is block window on shared memory
+// C is block distributed tensor
+template <typename Problem_, typename BlockPolicy_>
+struct BlockFlatmmASmemBSmemCRegV1
+{
+    using Problem        = remove_cvref_t<Problem_>;
+    using BlockPolicy    = remove_cvref_t<BlockPolicy_>;
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>; // TileFlatmmShape
+
+    static constexpr auto I0   = number<0>();
+    static constexpr auto I1   = number<1>();
+    static constexpr auto I2   = number<2>();
+    static constexpr auto idxM = I0;
+    static constexpr auto idxN = I1;
+    static constexpr auto idxK = I2;
+    using BlockTile            = remove_cvref_t<typename BlockGemmShape::BlockTile>;
+    using BlockWarps           = remove_cvref_t<typename BlockGemmShape::BlockWarps>;
+    using WarpTile             = remove_cvref_t<typename BlockGemmShape::WarpTile>;
+
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        constexpr index_t MPerBlock = BlockGemmShape::kM;
+        constexpr index_t NPerBlock = BlockGemmShape::kN;
+
+        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+
+        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    // C += A * B
+    template <typename CBlockTensor, typename ABlockWindow, typename BFlatBlockWindow>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   const ABlockWindow& a_block_window,
+                                   const BFlatBlockWindow& b_flat_block_window) const
+    {
+        static_assert(std::is_same_v<ADataType, typename ABlockWindow::DataType> &&
+                          std::is_same_v<BDataType, typename BFlatBlockWindow::DataType> &&
+                          std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                      "wrong!");
+        constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}];
+        constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}];
+
+        static_assert(MPerBlock == BlockGemmShape::kM && KPerBlock == BlockGemmShape::kK, "wrong!");
+
+        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG              = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp =
+            BlockTile::at(idxN) / (WarpTile::at(idxN) * BlockWarps::at(idxN));
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp;
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp;
+
+        constexpr index_t NFlatPerBlockPerIter = BlockGemmShape::flatNPerWarp;
+        constexpr index_t KFlatPerBlockPerIter = BlockGemmShape::flatKPerWarp;
+
+        const index_t iMWarp = get_warp_id() / NWarp;
+
+        // construct A-warp-window
+        auto a_warp_window_tmp = make_tile_window(
+            a_block_window.get_bottom_tensor_view(),
+            make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+            a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
+            make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows;
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+
+        // construct Bflat-warp-window
+        auto b_flat_warp_windows_tmp = b_flat_block_window;
+        statically_indexed_array<
+            statically_indexed_array<decltype(b_flat_warp_windows_tmp), KIterPerWarp>,
+            NIterPerWarp>
+            b_flat_warp_windows;
+        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                b_flat_warp_windows(nIter)(kIter) = b_flat_warp_windows_tmp;
+
+                move_tile_window(b_flat_warp_windows(nIter)(kIter),
+                                 {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter});
+            });
+        });
+
+        // auto b_warp_windows = b_origin_warp_windows;
+        auto b_warp_windows = b_flat_warp_windows;
+
+        using CWarpDstr   = typename WG::CWarpDstr;
+        using CWarpTensor = typename WG::CWarpTensor;
+
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        // hot loop:
+        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                // read A warp tensor from A block window
+                const auto a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter));
+
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B Block window
+                    const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter));
+
+                    // read C warp tensor from C block tensor
+                    CWarpTensor c_warp_tensor;
+
+                    c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                    // warp GEMM
+                    WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                    // write C warp tensor into C block tensor
+                    c_block_tensor.set_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        c_warp_tensor.get_thread_buffer());
+                });
+            });
+        });
+    }
+
+    // C = A * B
+    template <typename ABlockTensorTmp, typename BFlatBlockWindow>
+    CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp,
+                                   const BFlatBlockWindow& b_flat_block_window) const
+    {
+        auto c_block_tensor = MakeCBlockTile();
+        operator()(c_block_tensor, a_block_tensor_tmp, b_flat_block_window);
+        return c_block_tensor;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp
new file mode 100644
index 0000000000..d5b062a1b3
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+// Default policy for BlockGemmASmemBSmemCRegV1
+// Default policy class should not be templated, put template on member functions instead
+template <typename AType_,
+          typename BType_,
+          typename CType_,
+          typename BlockWarps_,
+          typename WarpGemm_>
+struct BlockFlatmmASmemBSmemCRegV1CustomPolicy
+{
+    using AType = remove_cvref_t<AType_>;
+    using BType = remove_cvref_t<BType_>;
+    using CType = remove_cvref_t<CType_>;
+
+    using BlockWarps = remove_cvref_t<BlockWarps_>;
+
+    static constexpr index_t kMWarps = BlockWarps::at(number<0>{});
+    static constexpr index_t kNWarps = BlockWarps::at(number<1>{});
+    static constexpr index_t kKWarps = BlockWarps::at(number<2>{});
+
+    using WarpGemm = remove_cvref_t<WarpGemm_>;
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
+    {
+        return make_tuple(WarpGemm{}, kMWarps, kNWarps);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
new file mode 100644
index 0000000000..eb45e6c0bd
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+
+namespace ck_tile {
+
+struct FlatmmProblem
+{
+    CK_TILE_HOST FlatmmProblem() = default;
+    CK_TILE_HOST FlatmmProblem(
+        index_t M_, index_t N_, index_t K_, index_t stride_A_, index_t stride_B_, index_t stride_C_)
+        : M(M_), N(N_), K(K_), stride_A(stride_A_), stride_B(stride_B_), stride_C(stride_C_)
+    {
+    }
+
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+};
+
+struct FlatmmHostArgs : public FlatmmProblem
+{
+    CK_TILE_HOST FlatmmHostArgs() = default;
+    CK_TILE_HOST FlatmmHostArgs(const void* a_ptr_,
+                                const void* b_shuffle_ptr_,
+                                void* c_ptr_,
+                                index_t k_batch_,
+                                index_t M_,
+                                index_t N_,
+                                index_t K_,
+                                index_t stride_A_,
+                                index_t stride_B_,
+                                index_t stride_C_)
+        : FlatmmProblem(M_, N_, K_, stride_A_, stride_B_, stride_C_),
+          a_ptr(a_ptr_),
+          b_shuffle_ptr(b_shuffle_ptr_),
+          c_ptr(c_ptr_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const void* a_ptr;
+    const void* b_shuffle_ptr;
+    void* c_ptr;
+    index_t k_batch;
+};
+
+template <typename TilePartitioner_, typename FlatmmPipeline_, typename EpiloguePipeline_>
+struct FlatmmKernel
+{
+    using TilePartitioner = remove_cvref_t<TilePartitioner_>;
+    using FlatmmPipeline  = remove_cvref_t<FlatmmPipeline_>;
+    using BlockGemmShape =
+        remove_cvref_t<typename FlatmmPipeline::BlockGemmShape>; // TileFlatmmShape
+    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                            = remove_cvref_t<typename FlatmmPipeline::ALayout>;
+    using BLayout                            = remove_cvref_t<typename FlatmmPipeline::BLayout>;
+    using CLayout                            = remove_cvref_t<typename FlatmmPipeline::CLayout>;
+    static constexpr index_t KernelBlockSize = FlatmmPipeline::BlockSize;
+
+    using ADataType = remove_cvref_t<typename FlatmmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename FlatmmPipeline::BDataType>;
+    // Below type is actually accumulation data type - the output of block GEMM.
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    static constexpr auto I0   = number<0>();
+    static constexpr auto I1   = number<1>();
+    static constexpr auto I2   = number<2>();
+    static constexpr auto idxM = I0;
+    static constexpr auto idxN = I1;
+    static constexpr auto idxK = I2;
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        return concat('_', "gemm", gemm_prec_str<ADataType, BDataType>, FlatmmPipeline::GetName());
+        // clang-format on
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+
+    struct FlatmmKernelArgs
+    {
+        const void* a_ptr;
+        const void* b_shuffle_ptr;
+        void* c_ptr;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t stride_A;
+        index_t stride_B;
+        index_t stride_C;
+        index_t k_batch;
+    };
+
+    CK_TILE_HOST static constexpr FlatmmKernelArgs MakeKernelArgs(const FlatmmHostArgs& hostArgs)
+    {
+        return FlatmmKernelArgs{hostArgs.a_ptr,
+                                hostArgs.b_shuffle_ptr,
+                                hostArgs.c_ptr,
+                                hostArgs.M,
+                                hostArgs.N,
+                                hostArgs.K,
+                                hostArgs.stride_A,
+                                hostArgs.stride_B,
+                                hostArgs.stride_C,
+                                hostArgs.k_batch};
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(FlatmmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    struct SplitKBatchOffset
+    {
+        __device__ SplitKBatchOffset(const FlatmmKernelArgs& kargs,
+                                     const std::size_t k_id = blockIdx.z)
+        {
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t   = kargs.k_batch * K1;
+            const index_t KRead = (kargs.K + K_t - 1) / K_t * K1;
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * KRead;
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * KRead * kargs.stride_A;
+            }
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * KRead * kargs.stride_B;
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * KRead;
+            }
+
+            if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
+            {
+                splitted_k = KRead;
+            }
+            else
+            {
+                splitted_k = kargs.K - KRead * (kargs.k_batch - 1);
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t splitted_k;
+    };
+
+    CK_TILE_HOST static bool IsSupportedArgument(const FlatmmKernelArgs& kargs)
+    {
+        if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                     is_any_of<CDataType, fp16_t, bf16_t>::value)
+        {
+            if(kargs.k_batch != 1)
+            {
+                std::cerr << "Conditions not met for Kbatch >1 !" << std::endl;
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.K % TilePartitioner::KPerBlock != 0 && FlatmmPipeline::kPadK == false)
+            {
+                std::cerr << "Can't support K that is not a multiple of KPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.K % FlatmmPipeline::GetVectorSizeA() != 0)
+            {
+                std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl;
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::MPerBlock != 0 && FlatmmPipeline::kPadM == false)
+            {
+                std::cerr << "Can't support M that is not a multiple of MPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.M % FlatmmPipeline::GetVectorSizeA() != 0)
+            {
+                std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl;
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false)
+            {
+                std::cerr << "Can't support N that is not a multiple of NPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.N % FlatmmPipeline::GetVectorSizeB() != 0)
+            {
+                std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.K % TilePartitioner::KPerBlock != 0 && FlatmmPipeline::kPadK == false)
+            {
+                std::cerr << "Can't support K that is not a multiple of KPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.K % FlatmmPipeline::GetVectorSizeB() != 0)
+            {
+                std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl;
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false)
+            {
+                std::cerr << "Can't support N that is not a multiple of NPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
+            {
+                std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl;
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::MPerBlock != 0 && FlatmmPipeline::kPadM == false)
+            {
+                std::cerr << "Can't support M that is not a multiple of MPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
+            {
+                std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl;
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr,
+                                                   const BDataType* b_flat_ptr,
+                                                   CDataType* c_ptr,
+                                                   const FlatmmKernelArgs& kargs,
+                                                   const SplitKBatchOffset& splitk_batch_offset)
+    {
+        const auto& a_tensor_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
+                    make_tuple(kargs.stride_A, 1),
+                    number<FlatmmPipeline::GetVectorSizeA()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(splitk_batch_offset.splitted_k, kargs.M),
+                    make_tuple(kargs.stride_A, 1),
+                    number<FlatmmPipeline::GetVectorSizeA()>{},
+                    number<1>{});
+            }
+        }();
+
+        index_t kFlatK = FlatmmPipeline::flatKPerWarp * (splitk_batch_offset.splitted_k /
+                                                         BlockGemmShape::WarpTile::at(number<2>{}));
+        index_t kFlatN = kargs.N * kargs.K / kFlatK;
+        const auto& b_flat_tensor_view = [&]() {
+            return make_naive_tensor_view<address_space_enum::global>(
+                b_flat_ptr,
+                make_tuple(kFlatN, kFlatK),
+                make_tuple(kFlatK, 1),
+                number<FlatmmPipeline::GetVectorSizeB()>{},
+                number<1>{});
+        }();
+
+        // TODO: enable vector write for C in ColMajor
+        const auto& c_tensor_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<EpiloguePipeline::GetVectorSizeC()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        return make_tuple(a_tensor_view, b_flat_tensor_view, c_tensor_view);
+    }
+
+    template <typename TensorView>
+    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
+    {
+        const auto& a_pad_view = [&]() {
+            const auto& a_tensor_view = views.at(I0);
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, FlatmmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::KPerBlock>{},
+                                                  number<TilePartitioner::MPerBlock>{}),
+                                       sequence<false, FlatmmPipeline::kPadM>{});
+            }
+        }();
+
+        const auto& b_flat_tensor_view = views.at(I1);
+
+        // TODO vector write in for C in ColMajor
+        const auto& c_pad_view = [&]() {
+            const auto& c_tensor_view = views.at(I2);
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<false, FlatmmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<FlatmmPipeline::kPadM, false>{});
+            }
+        }();
+
+        return make_tuple(a_pad_view, b_flat_tensor_view, c_pad_view);
+    }
+
+    template <typename PadView>
+    CK_TILE_DEVICE static auto
+    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
+    {
+        const auto& a_pad_view      = views.at(I0);
+        const auto& b_flat_pad_view = views.at(I1);
+        const auto& c_pad_view      = views.at(I2);
+
+        const auto& a_block_window = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_tile_window(a_pad_view,
+                                        make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                   number<TilePartitioner::KPerBlock>{}),
+                                        {i_m, 0});
+            }
+            else
+            {
+                return make_tile_window(a_pad_view,
+                                        make_tuple(number<TilePartitioner::KPerBlock>{},
+                                                   number<TilePartitioner::MPerBlock>{}),
+                                        {0, i_m});
+            }
+        }();
+
+        const auto& b_flat_block_window =
+            make_tile_window(b_flat_pad_view,
+                             make_tuple(number<FlatmmPipeline::flatNPerWarp>{},
+                                        number<FlatmmPipeline::flatKPerWarp>{}),
+                             {static_cast<int>(i_n / BlockGemmShape::WarpTile::at(idxN)), 0});
+
+        auto c_block_window = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {i_m, i_n});
+
+        return make_tuple(a_block_window, b_flat_block_window, c_block_window);
+    }
+
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunFlatmm(const ADataType* a_ptr,
+                                         const BDataType* b_flat_ptr,
+                                         CDataType* c_ptr,
+                                         void* smem_ptr,
+                                         const FlatmmKernelArgs& kargs,
+                                         const SplitKBatchOffset& splitk_batch_offset,
+                                         const index_t block_idx_m,
+                                         const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<DstInMemOp>(a_ptr, b_flat_ptr, c_ptr, kargs, splitk_batch_offset);
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window      = gemm_tile_windows.at(I0);
+        const auto& b_flat_block_window = gemm_tile_windows.at(I1);
+        const auto& c_block_tile        = FlatmmPipeline{}.template operator()(
+            a_block_window, b_flat_block_window, num_loop, smem_ptr);
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I2);
+
+        EpiloguePipeline{}
+            .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
+                c_block_window, c_block_tile, smem_ptr);
+    }
+
+    CK_TILE_DEVICE void operator()(FlatmmKernelArgs kargs) const
+    {
+        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
+        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+
+        const SplitKBatchOffset splitk_batch_offset(kargs);
+        // options
+        const ADataType* a_ptr =
+            static_cast<const ADataType*>(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset;
+        const BDataType* b_flat_ptr = static_cast<const BDataType*>(kargs.b_shuffle_ptr) +
+                                      splitk_batch_offset.b_k_split_offset;
+        CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr);
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        if(kargs.k_batch == 1)
+        {
+            RunFlatmm(a_ptr, b_flat_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+        }
+        else
+        {
+            // Do not compile in case where we have unsupported
+            // VectorSizeC & data type configuration.
+            if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           is_any_of<CDataType, fp16_t, bf16_t>::value))
+            {
+                RunFlatmm<memory_operation_enum::atomic_add>(
+                    a_ptr, b_flat_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+            }
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
new file mode 100644
index 0000000000..611aff318f
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/concat.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
+
+namespace ck_tile {
+
+template <typename Problem, typename PipelinePolicy = UniversalFlatmmPipelineAgBgCrPolicy>
+struct FlatmmPipelineAGmemBGmemCRegV1
+{
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>; // TileFlatmmShape
+
+    using ALayout = remove_cvref_t<typename Problem::ALayout>;
+    using BLayout = remove_cvref_t<typename Problem::BLayout>;
+    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+
+    using BlockFlatmm =
+        remove_cvref_t<decltype(PipelinePolicy::template GetBlockFlatmm<Problem>())>;
+
+    static constexpr index_t BlockSize = Problem::kBlockSize;
+
+    static constexpr index_t kMPerBlock = BlockGemmShape::kM;
+    static constexpr index_t kNPerBlock = BlockGemmShape::kN;
+    static constexpr index_t kKPerBlock = BlockGemmShape::kK;
+
+    static constexpr index_t flatKPerWarp = BlockGemmShape::flatKPerWarp;
+    static constexpr index_t flatNPerWarp = BlockGemmShape::flatNPerWarp;
+
+    static constexpr index_t GetVectorSizeA() { return Problem::VectorSizeA; }
+    static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; }
+    static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
+
+    static constexpr bool kPadM = Problem::kPadM;
+    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadK = Problem::kPadK;
+
+    static constexpr index_t kLdsAlignmentInBytes = 16;
+
+    static constexpr auto I0   = number<0>();
+    static constexpr auto I1   = number<1>();
+    static constexpr auto I2   = number<2>();
+    static constexpr auto idxM = I0;
+    static constexpr auto idxN = I1;
+    static constexpr auto idxK = I2;
+    using BlockTile            = remove_cvref_t<typename BlockGemmShape::BlockTile>;
+    using BlockWarps           = remove_cvref_t<typename BlockGemmShape::BlockWarps>;
+    using WarpTile             = remove_cvref_t<typename BlockGemmShape::WarpTile>;
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        return concat('_', "pipeline_AGmemBGmemCRegV1", 
+                      concat('x', kMPerBlock, kNPerBlock, kKPerBlock,  BlockSize),
+                      concat('x', GetVectorSizeA(), GetVectorSizeB(), GetVectorSizeC()),
+                      concat('x', kPadM, kPadN, kPadK));
+        // clang-format on
+    }
+
+    // For the basic gemm pipelien DoubleSmemBuffer set to be false naturally.
+    static constexpr bool DoubleSmemBuffer = false;
+
+    CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return PipelinePolicy::template GetSmemSize<Problem>();
+    }
+
+    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename AElementFunction>
+    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                        const AElementFunction& a_element_func,
+                                        const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                        index_t num_loop,
+                                        void* p_smem) const
+    {
+        static_assert(
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>>,
+            "wrong!");
+
+        static_assert(kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}],
+                      "wrong!");
+        static_assert(kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                      "wrong!");
+
+        // A tile in LDS
+        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+
+        constexpr auto a_lds_block_desc =
+            PipelinePolicy::template MakeALdsBlockDescriptor<Problem>();
+
+        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             PipelinePolicy::template MakeADramTileDistribution<Problem>());
+
+        // A LDS tile window for store
+        auto a_copy_lds_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // A LDS tile for block GEMM
+        auto a_lds_gemm_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // Block GEMM
+        auto block_flatmm = BlockFlatmm();
+
+        // B flat DRAM window for load
+        auto b_flat_distribution =
+            PipelinePolicy::template MakeBFlatDramTileDistribution<Problem>();
+        auto b_flat_dram_window = // tile_window_with_static_distribution
+            make_tile_window(
+                b_flat_dram_block_window_tmp.get_bottom_tensor_view(), // from kernel gemm_pad_views
+                make_tuple(number<flatNPerWarp>{}, number<flatKPerWarp>{}),
+                b_flat_dram_block_window_tmp.get_window_origin(),
+                b_flat_distribution);
+
+        // Acc register tile
+        auto c_block_tile = decltype(block_flatmm(a_lds_gemm_window, b_flat_dram_window)){};
+
+        // prefetch
+        // global read 0
+        auto a_block_tile = load_tile(a_copy_dram_window);
+
+        {
+            // move to 1
+            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+            // initialize C
+            tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+            // LDS write 0
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
+                    PipelinePolicy::template MakeShuffledARegBlockDistribution<Problem>());
+                shuffle_tile(a_shuffle_tmp, a_block_tile);
+                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
+                store_tile(a_copy_lds_window, a_block_tile_tmp);
+            }
+            else
+            {
+                store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile));
+            }
+        }
+
+        index_t iCounter = num_loop - 1;
+        while(iCounter > 0)
+        {
+            // global read i + 1
+            a_block_tile = load_tile(a_copy_dram_window);
+
+            block_sync_lds();
+
+            // GEMM i
+            block_flatmm(c_block_tile, a_lds_gemm_window, b_flat_dram_window);
+
+            block_sync_lds();
+
+            // move to i + 2
+            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+            // LDS write i + 1
+            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+            store_tile(a_copy_lds_window, a_block_tile_tmp);
+
+            // move to next flat K
+            move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock});
+
+            iCounter--;
+        }
+
+        // tail
+        {
+            block_sync_lds();
+
+            // GEMM num_loop - 1
+            block_flatmm(c_block_tile, a_lds_gemm_window, b_flat_dram_window);
+        }
+
+        return c_block_tile;
+    }
+
+    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
+    {
+        return operator()(
+            a_dram_block_window_tmp,
+            [](const ADataType& a) { return a; },
+            b_flat_dram_block_window_tmp,
+            num_loop,
+            p_smem);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
new file mode 100644
index 0000000000..d1aac07d54
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+
+namespace ck_tile {
+
+struct UniversalFlatmmPipelineAgBgCrPolicy
+{
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+    static constexpr auto I2 = number<2>{};
+
+    // 3d + padding
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kKPerBlock / 8>{}, number<kMPerBlock>{}, number<8>{}),
+            make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
+            number<8>{},
+            number<1>{});
+
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+            a_lds_block_desc_0,
+            make_tuple(make_pass_through_transform(kMPerBlock),
+                       make_merge_transform(make_tuple(kKPerBlock / 8, 8))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return a_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
+    {
+        constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
+                                        MakeALdsBlockDescriptor<Problem>().get_element_space_size();
+        return smem_size_a;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
+
+        return smem_size_a;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA()
+    {
+        return Problem::VectorLoadSize;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t M0           = MPerBlock / M1;
+            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % M1 == 0);
+            constexpr index_t K3    = total_pixels / M1;
+            constexpr index_t KPack = GetSmemPackA<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * M0))
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * M0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * M0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+            constexpr index_t K1 = 16 / sizeof(ADataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t M2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (M2 * K0) == 0)
+            {
+                constexpr index_t M1 = BlockSize / get_warp_size();
+                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
+                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
+                constexpr index_t M0 = MPerBlock / (M2 * M1);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M2, M1 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            else
+            {
+                constexpr index_t M0 = BlockSize / get_warp_size();
+                constexpr index_t M1 = MPerBlock / (M2 * M0);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M1, M2 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
+    {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+
+        using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t WaveSize  = get_warp_size();
+        constexpr index_t WaveNum   = BlockSize / WaveSize;
+
+        constexpr index_t KBPerLoad =
+            Problem::VectorLoadSize / sizeof(BDataType); // dwordx4 load B elem cnt
+        constexpr index_t KThdPerWave = WaveSize;        // threads cnt in K dim
+        constexpr index_t KWavePerBlk = 1;
+        constexpr index_t KRepeat     = 1;
+
+        constexpr index_t NBPerLoad   = 1;
+        constexpr index_t NThdPerWave = 1;
+        constexpr index_t NWavePerBlk = TileShape::BlockWarps::at(TileShape::idxN); // N_Warp
+        constexpr index_t NRepeat     = 1;
+
+        constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<WaveRepeat>,                                          // ?
+                tuple<sequence<NRepeat, NWavePerBlk, NThdPerWave, NBPerLoad>,  // second direction
+                      sequence<KRepeat, KWavePerBlk, KThdPerWave, KBPerLoad>>, // first  direction
+                // wave in blk,     // thd in wave
+                // <M, K>           // <M, K>
+                tuple<sequence<0, 1, 2>, sequence<1, 2>>, // which direction
+                tuple<sequence<0, 1, 1>, sequence<2, 2>>, // which index
+                // <repeat, vec_load>
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDistribution()
+    {
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>);
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+        constexpr index_t M0           = kMPerBlock / M1;
+        constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize;
+        static_assert(total_pixels % M1 == 0);
+        constexpr index_t K3     = total_pixels / M1;
+        constexpr index_t kKPack = GetSmemPackA<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * M0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * M0);
+            constexpr index_t K0 = kBlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * M0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
+            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockFlatmm()
+    {
+        using AccDataType = float;
+        using BlockWarps  = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile    = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm    = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                typename Problem::BDataType,
+                                                AccDataType,
+                                                WarpTile::at(I0),
+                                                WarpTile::at(I1),
+                                                WarpTile::at(I2),
+                                                Problem::TransposeC>;
+
+        using BlockFlatmmPolicy =
+            BlockFlatmmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                    typename Problem::BDataType,
+                                                    typename Problem::CDataType,
+                                                    BlockWarps,
+                                                    WarpGemm>;
+        return BlockFlatmmASmemBSmemCRegV1<Problem, BlockFlatmmPolicy>{};
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp b/include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp
new file mode 100644
index 0000000000..551d390ec6
--- /dev/null
+++ b/include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/concat.hpp"
+
+namespace ck_tile {
+
+template <typename BlockTile_, typename BlockWarps_, typename WarpTile_>
+struct TileFlatmmShape
+{
+    using BlockTile  = remove_cvref_t<BlockTile_>;
+    using BlockWarps = remove_cvref_t<BlockWarps_>;
+    using WarpTile   = remove_cvref_t<WarpTile_>;
+
+    static constexpr auto idxM = number<0>{};
+    static constexpr auto idxN = number<1>{};
+    static constexpr auto idxK = number<2>{};
+
+    static constexpr index_t NumWarps = reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+
+    static constexpr index_t kM = BlockTile::at(idxM);
+    static constexpr index_t kN = BlockTile::at(idxN);
+    static constexpr index_t kK = BlockTile::at(idxK);
+
+    static constexpr index_t flatNPerWarp  = BlockWarps::at(idxN);
+    static constexpr index_t flatKPerWarp  = WarpTile::at(idxK) * WarpTile::at(idxN);
+    static constexpr index_t flatKPerBlock = flatKPerWarp * kK / WarpTile::at(idxK);
+
+    CK_TILE_HOST static std::string GetName()
+    {
+        // clang-format off
+        return concat('_', "tile_flatmm_shape",
+                      concat('x', kM, kN, kK, NumWarps),
+                      concat('x', BlockWarps::at(idxM), BlockWarps::at(idxN), BlockWarps::at(idxK)),
+                      concat('x', (WarpTile::at(idxM)), WarpTile::at(idxN), WarpTile::at(idxK)));
+        // clang-format on
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index c896534e03..a28b63f813 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -37,6 +37,8 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
index 23174528e7..35b2f02e8a 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
@@ -100,10 +100,10 @@ struct FmhaBwdDQDKDVKernel
             "r" + _TS_(gbr4::at(ck_tile::number<0>{})) + "x" + _TS_(gbr4::at(ck_tile::number<1>{})) + "x" + _TS_(gbr4::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt0::at(ck_tile::number<0>{})) + "x" + _TS_(gwt0::at(ck_tile::number<1>{})) + "x" + _TS_(gwt0::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt1::at(ck_tile::number<0>{})) + "x" + _TS_(gwt1::at(ck_tile::number<1>{})) + "x" + _TS_(gwt1::at(ck_tile::number<2>{})) + "_" +
-            ("o" + _TS_(kBlockPerCu) + "_") + _SS_(FmhaPipeline::name) + (pn.empty() ? "" : "_" + pn) +
-            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
-            (kHasBiasGrad ? "_dbias" : "") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kHasDropout ? "_dropout" : "" ) +
-            (kIsStoreRandval ? "_storerandval" : "" ) + (kIsDeterministic ? "_deterministic" : "" );
+            ("o" + _TS_(kBlockPerCu) + "_") + _SS_(FmhaPipeline::name) + (pn.empty() ? "_npad" : "_" + pn) +
+            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
+            (kHasBiasGrad ? "_dbias" : "_ndbias") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kHasDropout ? "_dropout" : "_ndropout" ) +
+            (kIsStoreRandval ? "_storerandval" : "" ) + (kIsDeterministic ? "_deterministic" : "_ndeterministic" );
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -1620,7 +1620,7 @@ struct FmhaBwdOGradDotOKernel
         return
             _SS_("fmha_bwd_dot_do_o_d") + _TS_(kVHeaddim) + "_" + _SS_(t2s<ODataType>::name) +
             "_" + (kIsGroupMode ? "group" : "batch") + "_" +
-            ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "" : "_" + pn);
+            ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "_npad" : "_" + pn);
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -1875,8 +1875,8 @@ struct FmhaBwdConvertQGradKernel
             return n.empty() ? n : std::string("p") + n; }();
         return
             _SS_("fmha_bwd_convert_dq_d") + _TS_(kQKHeaddim) + "_" + _SS_(t2s<QGradDataType>::name) +
-            "_" + (kIsGroupMode ? "group" : "batch") + (kIsDeterministic ? "_deterministic" : "") + "_" +
-            ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "" : "_" + pn);
+            "_" + (kIsGroupMode ? "group" : "batch") + "_" + ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "_npad" : "_" + pn) + 
+            (kIsDeterministic ? "_deterministic" : "_ndeterministic") ;
         #undef _SS_
         #undef _TS_
         // clang-format on
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index f107b10dff..1202524950 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -54,6 +54,8 @@ struct FmhaFwdKernel
     using FmhaMask                 = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
     static constexpr bool kHasMask = FmhaMask::IsMasking;
 
+    static constexpr bool kUseAsyncCopy = FmhaPipeline::Policy::AsyncCopy;
+
     // clang-format off
     template <typename T> struct t2s;
     template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
@@ -91,9 +93,9 @@ struct FmhaFwdKernel
             "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
-            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
-            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
-            (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kStoreLSE ? "_lse" : "" ) + (kHasDropout ? "_dropout" : "" ) + (kDoFp8StaticQuant ? "_squant" : "" );
+            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) +
+            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
+            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kHasDropout ? "_dropout" : "_ndropout" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant" );
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -981,7 +983,7 @@ struct FmhaFwdKernel
             }
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
             {
-                batch_offset_bias = query_start * kargs.stride_bias + key_start;
+                batch_offset_bias = query_start * kargs.stride_bias;
             }
             if constexpr(kStoreLSE)
             {
@@ -1082,10 +1084,11 @@ struct FmhaFwdKernel
                 number<FmhaPipeline::kAlignmentK>{},
                 number<1>{});
 
+            constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? kPadSeqLenK : false;
             return pad_tensor_view(
                 k_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<kPadSeqLenK, kPadHeadDimQ>{});
+                sequence<kPadSeqLenK_, kPadHeadDimQ>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
@@ -1104,10 +1107,11 @@ struct FmhaFwdKernel
                                           make_tuple(sequence<1>{}, sequence<0>{}),
                                           make_tuple(sequence<0>{}, sequence<1>{}));
 
+                constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? kPadSeqLenK : false;
                 return pad_tensor_view(
                     v_dram_transposed,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV, kPadSeqLenK_>{});
             }
             else
             {
@@ -1118,10 +1122,11 @@ struct FmhaFwdKernel
                     number<FmhaPipeline::kAlignmentV>{},
                     number<1>{});
 
+                constexpr bool kPadHeadDimV_ = kUseAsyncCopy ? kPadHeadDimV : false;
                 return pad_tensor_view(
                     v_dram_naive,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV_, kPadSeqLenK>{});
             }
         }();
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
index a342a91f10..99ee912db9 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -54,9 +54,9 @@ struct FmhaFwdSplitKVCombineKernel
             "b" + _TS_(FmhaPipeline::kN1) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) +
             _SS_(FmhaPipeline::name) +
-            (pn.empty() ? "" : "_" + pn) +
-            (kStoreLSE ? "_lse" : "" ) +
-            (kDoFp8StaticQuant ? "_squant" : "" );
+            (pn.empty() ? "_npad" : "_" + pn) +
+            (kStoreLSE ? "_lse" : "_nlse" ) +
+            (kDoFp8StaticQuant ? "_squant" : "_nsquant" );
         #undef _SS_
         #undef _TS_
         // clang-format on
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 14d0596287..ea1762abc1 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -94,9 +94,10 @@ struct FmhaFwdSplitKVKernel
             "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
-            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
-            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) + 
-            (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kStoreLSE ? "_lse" : "" ) + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "" );
+            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) +
+            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
+            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) +
+            (kDoFp8StaticQuant ? "_squant" : "_nsquant") + (kIsPagedKV ? "_pagedkv" : "_npagedkv" );
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -562,7 +563,7 @@ struct FmhaFwdSplitKVKernel
             }
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
             {
-                batch_offset_bias = query_start * kargs.stride_bias + key_start;
+                batch_offset_bias = query_start * kargs.stride_bias;
             }
 
             batch_offset_lse_acc = query_start;
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
index 3d53535b28..809c58f1d1 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
@@ -44,6 +44,8 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
     static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
     static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
+    static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
+
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
     static constexpr bool kPadSeqLenK      = Problem::kPadSeqLenK;
@@ -97,6 +99,10 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
             {
                 return 1;
             }
+            else
+            {
+                return 1;
+            }
         }
     }();
 
@@ -316,11 +322,11 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
 
         // load Q from LDS
         __builtin_amdgcn_sched_barrier(0);
-        auto q_lds_window_for_load = make_tile_window(
-            q_lds,
-            Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
-            {0, 0},
-            Policy::template MakeQRegTileDistribution<Problem, decltype(gemm_0)>());
+        auto q_lds_window_for_load =
+            make_tile_window(q_lds,
+                             Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0},
+                             Policy::template MakeQRegTileDistribution<Problem>());
         block_sync_lds();
         auto q = load_tile(q_lds_window_for_load);
         __builtin_amdgcn_sched_barrier(0);
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
index 74d755ef39..9d8f6bc99f 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
@@ -13,14 +13,12 @@ namespace ck_tile {
 // This pipeline is qkv all located in LDS
 struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
     : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
-                                          /* AsyncCopyK = */ false,
-                                          /* AsyncCopyV = */ false,
+                                          /* AsyncCopy = */ false,
                                           /* NumPrefetchK = */ 1,
                                           /* NumPrefetchV = */ 1>
 {
     using BasePolicy = BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
-                                                           /* AsyncCopyK = */ false,
-                                                           /* AsyncCopyV = */ false,
+                                                           /* AsyncCopy = */ false,
                                                            /* NumPrefetchK = */ 1,
                                                            /* NumPrefetchV = */ 1>;
 
@@ -76,10 +74,10 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
                                        sequence<0, 1>>{});
     }
 
-    template <typename Problem, typename BlockGemm>
+    template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeQRegTileDistribution()
     {
-        return BasePolicy::template MakeQDramTileDistribution<Problem, BlockGemm>();
+        return BasePolicy::template MakeQRegTileDistribution<Problem>();
     }
 
     template <typename Problem>
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
index 04aa85644d..ce80dba5eb 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -43,6 +43,8 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
     static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
     static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
+    static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
+
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
     static constexpr bool kPadSeqLenK      = Problem::kPadSeqLenK;
@@ -96,6 +98,10 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
             {
                 return 1;
             }
+            else
+            {
+                return 1;
+            }
         }
     }();
 
@@ -180,11 +186,10 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
         constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
         constexpr auto gemm_1 = Policy::template GetKVBlockGemm<Problem>();
 
-        auto q_dram_window = make_tile_window(
-            q_dram_block_window_tmp.get_bottom_tensor_view(),
-            q_dram_block_window_tmp.get_window_lengths(),
-            q_dram_block_window_tmp.get_window_origin(),
-            Policy::template MakeQDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(),
+                                              q_dram_block_window_tmp.get_window_lengths(),
+                                              q_dram_block_window_tmp.get_window_origin(),
+                                              Policy::template MakeQRegTileDistribution<Problem>());
 
         auto q = load_tile(q_dram_window);
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp
index b7f1f042ed..ccc4f23817 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp
@@ -11,8 +11,7 @@ namespace ck_tile {
 // This pipeline is qkv all located in LDS
 struct BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy
     : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
-                                          /* AsyncCopyK = */ false,
-                                          /* AsyncCopyV = */ false,
+                                          /* AsyncCopy = */ false,
                                           /* NumPrefetchK = */ 1,
                                           /* NumPrefetchV = */ 1>
 {
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
index a7e9287143..8a4a925b81 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
@@ -45,6 +45,8 @@ struct BlockFmhaPipelineQRKSVS
     static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
     static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
+    static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
+
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
     static constexpr bool kPadSeqLenK  = Problem::kPadSeqLenK;
@@ -96,6 +98,10 @@ struct BlockFmhaPipelineQRKSVS
             {
                 return 1;
             }
+            else
+            {
+                return 1;
+            };
         }
     }();
 
@@ -178,11 +184,10 @@ struct BlockFmhaPipelineQRKSVS
         constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
         constexpr auto gemm_1 = Policy::template GetKVBlockGemm<Problem>();
 
-        auto q_dram_window = make_tile_window(
-            q_dram_block_window_tmp.get_bottom_tensor_view(),
-            q_dram_block_window_tmp.get_window_lengths(),
-            q_dram_block_window_tmp.get_window_origin(),
-            Policy::template MakeQDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(),
+                                              q_dram_block_window_tmp.get_window_lengths(),
+                                              q_dram_block_window_tmp.get_window_origin(),
+                                              Policy::template MakeQRegTileDistribution<Problem>());
 
         auto q = load_tile(q_dram_window);
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index 173887513e..67354fc72d 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -46,6 +46,8 @@ struct BlockFmhaPipelineQRKSVSAsync
     static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
     static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
 
+    static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
+
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     // TODO: seq_q always support padding, hdim_q/v support multiple of vector(like 8x)
     //       only need special care about seq_k padding (oob need set -INF of p instead of zero)
@@ -110,10 +112,21 @@ struct BlockFmhaPipelineQRKSVSAsync
                 else
                     return 2;
             }
+            else if constexpr(kQKHeaddim <= 192)
+            {
+                if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                    return 1;
+                else
+                    return 2;
+            }
             else if constexpr(kQKHeaddim <= 256)
             {
                 return 1;
             }
+            else
+            {
+                return 1;
+            };
         }
     }();
 
@@ -189,19 +202,8 @@ struct BlockFmhaPipelineQRKSVSAsync
                     Policy::template MakeKLdsStoreBlockDescriptor<Problem>(i_buf).get_lengths(),
                     {0, 0, 0});
             },
-            number<Policy::NumPrefetchK>{});
+            number<Policy::NumKVLdsBuffers>{});
 
-#if K_LDS_LOAD_USE_OFFSET_TRANSFORM
-        auto k_lds_load = generate_tuple(
-            [&](auto i_buf) {
-                return make_tile_window(
-                    make_tensor_view<address_space_enum::lds>(
-                        k_lds_ptr, Policy::template MakeKLdsLoadBlockDescriptor<Problem>(i_buf)),
-                    Policy::template MakeKLdsLoadBlockDescriptor<Problem>(i_buf).get_lengths(),
-                    {0, 0});
-            },
-            number<Policy::NumPrefetchK>{});
-#else
         auto k_lds_Load_view = make_tensor_view<address_space_enum::lds>(
             k_lds_ptr, Policy::template MakeKLdsLoadBlockDescriptor<Problem>());
 
@@ -209,7 +211,6 @@ struct BlockFmhaPipelineQRKSVSAsync
             make_tile_window(k_lds_Load_view,
                              Policy::template MakeKLdsLoadBlockDescriptor<Problem>().get_lengths(),
                              {0, 0});
-#endif
 
         // V tile in LDS
         auto v_lds = make_tensor_view<address_space_enum::lds>(
@@ -222,11 +223,10 @@ struct BlockFmhaPipelineQRKSVSAsync
         constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
         constexpr auto gemm_1 = Policy::template GetKVBlockGemm<Problem>();
 
-        auto q_dram_window = make_tile_window(
-            q_dram_block_window_tmp.get_bottom_tensor_view(),
-            q_dram_block_window_tmp.get_window_lengths(),
-            q_dram_block_window_tmp.get_window_origin(),
-            Policy::template MakeQDramTileDistribution<Problem, decltype(gemm_0)>());
+        auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(),
+                                              q_dram_block_window_tmp.get_window_lengths(),
+                                              q_dram_block_window_tmp.get_window_origin(),
+                                              Policy::template MakeQRegTileDistribution<Problem>());
         q_dram_window.init_raw();
 
         // TODO: we use async Copy for K, which is inline asm
@@ -368,14 +368,9 @@ struct BlockFmhaPipelineQRKSVSAsync
                     gemm_0(s_acc,
                            get_slice_tile(
                                q, sequence<0, i_k0 * kK0>{}, sequence<kM0, (i_k0 + 1) * kK0>{}),
-#if K_LDS_LOAD_USE_OFFSET_TRANSFORM
-                           k_lds_load[number<LdsSeq.at(number<i_k0>{})>{}]);
-
-#else
                            get_slice_tile(k_lds_load,
                                           sequence<(LdsSeq.at(number<i_k0>{})) * kN0, 0>{},
                                           sequence<(LdsSeq.at(number<i_k0>{}) + 1) * kN0, kK0>{}));
-#endif
                 });
             }
 
@@ -391,18 +386,13 @@ struct BlockFmhaPipelineQRKSVSAsync
             auto v_buf           = load_tile(v_dram_window, number<-1>{}, bool_constant<false>{});
             __builtin_amdgcn_sched_barrier(0);
             { // tail
-                gemm_0(s_acc,
-                       get_slice_tile(
-                           q, sequence<0, (k0_loops - 1) * kK0>{}, sequence<kM0, k0_loops * kK0>{}),
-#if K_LDS_LOAD_USE_OFFSET_TRANSFORM
-                       k_lds_load[number<LdsSeq.at(number<k0_loops - 1>{})>{}]);
-
-#else
-                       get_slice_tile(
-                           k_lds_load,
-                           sequence<(LdsSeq.at(number<k0_loops - 1>{})) * kN0, 0>{},
-                           sequence<(LdsSeq.at(number<k0_loops - 1>{}) + 1) * kN0, kK0>{}));
-#endif
+                gemm_0(
+                    s_acc,
+                    get_slice_tile(
+                        q, sequence<0, (k0_loops - 1) * kK0>{}, sequence<kM0, k0_loops * kK0>{}),
+                    get_slice_tile(k_lds_load,
+                                   sequence<(LdsSeq.at(number<k0_loops - 1>{})) * kN0, 0>{},
+                                   sequence<(LdsSeq.at(number<k0_loops - 1>{}) + 1) * kN0, kK0>{}));
             }
             __builtin_amdgcn_sched_barrier(1);
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp
index 7824bbdefb..e92ba58b37 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp
@@ -11,8 +11,7 @@ namespace ck_tile {
 // This pipeline is qkv all located in LDS
 using BlockFmhaPipelineQRKSVSAsyncDefaultPolicy =
     BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
-                                        /* AsyncCopyK = */ true,
-                                        /* AsyncCopyV = */ false,
+                                        /* AsyncCopy = */ true,
                                         /* NumPrefetchK = */ 3,
                                         /* NumPrefetchV = */ 3>;
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp
index 6ce4591aff..e905037398 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp
@@ -8,11 +8,9 @@
 
 namespace ck_tile {
 
-// This pipeline is qkv all located in LDS
 using BlockFmhaPipelineQRKSVSDefaultPolicy =
     BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
-                                        /* AsyncCopyK = */ false,
-                                        /* AsyncCopyV = */ false,
+                                        /* AsyncCopy = */ false,
                                         /* NumPrefetchK = */ 1,
                                         /* NumPrefetchV = */ 1>;
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp
new file mode 100644
index 0000000000..cc532040e8
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp"
+#include "ck_tile/ops/fmha/block/block_dropout.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce.hpp"
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = BlockFmhaPipelineQRKSVSWholeKPrefetchDefaultPolicy>
+struct BlockFmhaPipelineQRKSVSWholeKPrefetch
+{
+    using Problem               = remove_cvref_t<Problem_>;
+    using Policy                = remove_cvref_t<Policy_>;
+    using QDataType             = remove_cvref_t<typename Problem::QDataType>;
+    using KDataType             = remove_cvref_t<typename Problem::KDataType>;
+    using VDataType             = remove_cvref_t<typename Problem::VDataType>;
+    using SaccDataType          = remove_cvref_t<typename Problem::SaccDataType>;
+    using SMPLComputeDataType   = remove_cvref_t<typename Problem::SMPLComputeDataType>;
+    using BiasDataType          = remove_cvref_t<typename Problem::BiasDataType>;
+    using RandValOutputDataType = remove_cvref_t<typename Problem::RandValOutputDataType>;
+    using LSEDataType           = remove_cvref_t<typename Problem::LSEDataType>;
+    using PDataType             = remove_cvref_t<typename Problem::PDataType>;
+    using OaccDataType          = remove_cvref_t<typename Problem::OaccDataType>;
+    using ODataType             = remove_cvref_t<typename Problem::ODataType>;
+    using FmhaMask              = remove_cvref_t<typename Problem::FmhaMask>;
+
+    using BlockFmhaShape             = remove_cvref_t<typename Problem::BlockFmhaShape>;
+    using VLayout                    = remove_cvref_t<typename BlockFmhaShape::VLayout>;
+    static constexpr bool kQLoadOnce = true;
+    static_assert(kQLoadOnce == Policy::QLoadOnce);
+
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
+
+    static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
+
+    static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
+    static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
+    static constexpr bool kPadSeqLenK  = Problem::kPadSeqLenK;
+    static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ;
+    static constexpr bool kPadHeadDimV = (kQKHeaddim < kSubQKHeaddim) ? 1 : Problem::kPadHeadDimV;
+    static constexpr auto BiasEnum     = Problem::BiasEnum;
+    static constexpr bool kStoreLSE    = Problem::kStoreLSE;
+    static constexpr bool kHasDropout  = Problem::kHasDropout;
+
+    // last dimension vector length used to create tensor view(and decide buffer_load vector length)
+    // ... together with tensor distribution. tensor dist should able to overwrite this
+    static constexpr index_t kAlignmentQ =
+        kPadHeadDimQ ? 1 : Policy::template GetAlignmentQ<Problem>();
+    static constexpr index_t kAlignmentK =
+        kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
+    static constexpr index_t kAlignmentV = []() {
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            return Problem::kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
+        else
+            return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
+    }();
+
+    static constexpr index_t kAlignmentO =
+        kPadHeadDimV ? 1 : Policy::template GetAlignmentO<Problem>();
+    static constexpr index_t kAlignmentBias =
+        kPadSeqLenK ? 1 : Policy::template GetAlignmentBias<Problem>();
+
+    static constexpr index_t kBlockPerCu = []() {
+        if constexpr(Problem::kBlockPerCu != -1)
+            return Problem::kBlockPerCu;
+        else
+        {
+            if constexpr(kQKHeaddim == 32)
+            {
+                return 2;
+            }
+            else if constexpr(kQKHeaddim == 64)
+            {
+                return 2;
+            }
+            else if constexpr(kQKHeaddim == 96 || kQKHeaddim == 128)
+            {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                    return 1;
+                else
+                    return 2;
+            }
+            else if constexpr(kQKHeaddim == 256)
+            {
+                return 1;
+            }
+            else
+            {
+                return 1;
+            };
+        }
+    }();
+
+    static constexpr const char* name = "qr_async";
+
+    using DropoutType = std::conditional_t<kHasDropout, BlockDropout, NullBlockDropout>;
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    template <typename QDramBlockWindowTmp,
+              typename KDramBlockWindowTmp,
+              typename VDramBlockWindowTmp,
+              typename BiasDramBlockWindowTmp,
+              typename RandValDramBlockWindowTmp,
+              typename LSEDramBlockWindowTmp,
+              typename QElementFunction,
+              typename KElementFunction,
+              typename VElementFunction,
+              typename BiasElementFunction,
+              typename LSEElementFunction,
+              typename SAccElementFunction,
+              typename PComputeElementFunction,
+              typename OAccElementFunction,
+              typename PositionEncoding>
+    CK_TILE_HOST_DEVICE auto
+    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*kSubQKHeaddim tile
+               const QElementFunction& q_element_func,
+               const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*kSubQKHeaddim tile
+               const KElementFunction& k_element_func,
+               const VDramBlockWindowTmp& v_dram_block_window_tmp, // N1*K1 tile
+               const VElementFunction& v_element_func,
+               const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile
+               const BiasElementFunction& bias_element_func,
+               RandValDramBlockWindowTmp& randval_dram_block_window_tmp,
+               LSEDramBlockWindowTmp& lse_dram_window_tmp, // M0*1 tile
+               const LSEElementFunction& lse_element_func,
+               const SAccElementFunction& s_acc_element_func,
+               const PComputeElementFunction& p_compute_element_func,
+               const OAccElementFunction& o_acc_element_func,
+               FmhaMask mask,
+               PositionEncoding position_encoding,
+               float scale_s,
+               void* smem_ptr,
+               DropoutType& dropout) const
+    {
+        ignore = q_element_func;
+        ignore = k_element_func;
+
+        static_assert(
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            "wrong!");
+
+        static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kN0 == KDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kK0 == KDramBlockWindowTmp{}.get_window_lengths()[number<1>{}] &&
+                          kN1 == VDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kK1 == VDramBlockWindowTmp{}.get_window_lengths()[number<1>{}] &&
+                          kM0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kN0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                      "wrong!");
+
+        constexpr auto I0 = number<0>{};
+        constexpr auto I1 = number<1>{};
+
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
+        constexpr index_t k1_loops = kN0 / kK1;
+        static_assert(2 <= k0_loops);
+        static_assert(2 <= k1_loops);
+
+        constexpr bool kPreloadWholeNextIterationK =
+            Policy::template IsPreloadWholeNextIterationK<Problem>();
+
+        constexpr auto NumKLdsBuffers = Policy::template GetNumKLdsBuffers<Problem>();
+        constexpr auto NumVLdsBuffers = Policy::template GetNumVLdsBuffers<Problem>();
+        constexpr auto NumPrefetchV   = Policy::template GetNumPrefetchV<Problem>();
+
+        static_assert(NumKLdsBuffers >= 2);
+
+        auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(),
+                                              q_dram_block_window_tmp.get_window_lengths(),
+                                              q_dram_block_window_tmp.get_window_origin(),
+                                              Policy::template MakeQRegTileDistribution<Problem>());
+
+        const auto q_origin = q_dram_window.get_window_origin();
+        const auto [seqlen_k_start, seqlen_k_end] =
+            mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
+
+        auto k_dram_block_window =
+            make_tile_window(k_dram_block_window_tmp.get_bottom_tensor_view(),
+                             k_dram_block_window_tmp.get_window_lengths(),
+                             {seqlen_k_start, 0});
+
+        auto k_dram_window =
+            make_tile_window(k_dram_block_window.get_bottom_tensor_view(),
+                             k_dram_block_window.get_window_lengths(),
+                             k_dram_block_window.get_window_origin(),
+                             Policy::template MakeKDramTileDistribution<Problem>());
+
+        using k_tile_type = decltype(load_tile(k_dram_window));
+
+        auto k_tiles = [&]() {
+            if constexpr(kPreloadWholeNextIterationK)
+                return statically_indexed_array<k_tile_type, k0_loops>{};
+            else
+                return statically_indexed_array<k_tile_type, 1>{};
+        }();
+
+        k_tiles[I0] = load_tile(k_dram_window);
+        move_tile_window(k_dram_window, {0, kK0});
+
+        auto q_tile = load_tile(q_dram_window);
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // K tile in LDS
+        KDataType* k_lds_ptr = static_cast<KDataType*>(smem_ptr);
+        auto k_lds           = make_tensor_view<address_space_enum::lds>(
+            k_lds_ptr, Policy::template MakeKLdsBlockDescriptor<Problem>());
+        auto k_lds_window = make_tile_window(
+            k_lds, Policy::template MakeKLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+
+        using k_lds_window_type =
+            decltype(get_slice_tile(k_lds_window, sequence<0, 0>{}, sequence<kN0, kK0>{}));
+
+        statically_indexed_array<k_lds_window_type, NumKLdsBuffers> k_lds_windows;
+
+        static_for<0, NumKLdsBuffers, 1>{}([&](auto i_buf) {
+            k_lds_windows[i_buf] = get_slice_tile(
+                k_lds_window, sequence<i_buf * kN0, 0>{}, sequence<(i_buf + 1) * kN0, kK0>{});
+        });
+
+        auto v_dram_window =
+            make_tile_window(v_dram_block_window_tmp.get_bottom_tensor_view(),
+                             v_dram_block_window_tmp.get_window_lengths(),
+                             {0, seqlen_k_start}, // TODO: hdim split?
+                             Policy::template MakeVDramTileDistribution<Problem>());
+        // V tile in LDS
+        auto v_lds = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<VDataType*>(static_cast<char*>(smem_ptr) +
+                                         Policy::template GetExclusiveKLdsBytes<Problem>()),
+            Policy::template MakeVLdsBlockDescriptor<Problem>());
+        auto v_lds_window = make_tile_window(
+            v_lds, Policy::template MakeVLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+
+        using v_tile_type = decltype(load_tile(v_dram_window));
+
+        statically_indexed_array<v_tile_type, NumPrefetchV> v_tiles;
+
+        using v_lds_window_type =
+            decltype(get_slice_tile(v_lds_window, sequence<0, 0>{}, sequence<kN1, kK1>{}));
+
+        statically_indexed_array<v_lds_window_type, NumVLdsBuffers> v_lds_windows;
+
+        static_for<0, NumVLdsBuffers, 1>{}([&](auto i_buf) {
+            v_lds_windows[i_buf] = get_slice_tile(
+                v_lds_window, sequence<i_buf * kN1, 0>{}, sequence<(i_buf + 1) * kN1, kK1>{});
+        });
+
+        // Block GEMM
+        constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
+        constexpr auto gemm_1 = Policy::template GetKVBlockGemm<Problem>();
+
+        using SaccBlockTileType = decltype(gemm_0.MakeCBlockTile());
+        auto s_acc              = SaccBlockTileType{};
+
+        // reduction function for softmax
+        const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+        const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
+
+        // infer Sacc, S, P, M, L, Oacc type
+        using SBlockTileType = decltype(cast_tile<SMPLComputeDataType>(s_acc));
+
+        using MLBlockTileType = decltype(block_tile_reduce<SMPLComputeDataType>(
+            SBlockTileType{}, sequence<1>{}, f_max, SMPLComputeDataType{0}));
+
+        using OaccBlockTileType = decltype(gemm_1.MakeCBlockTile());
+
+        // init Oacc, M, L
+        auto o_acc = OaccBlockTileType{};
+        auto m     = MLBlockTileType{};
+        auto l     = MLBlockTileType{};
+
+        clear_tile(o_acc);
+        set_tile(m, -numeric<SMPLComputeDataType>::infinity());
+        clear_tile(l);
+
+        const auto num_total_loop = integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0);
+
+        // check early exit if no work to do
+        if constexpr(FmhaMask::IsMasking || kPadSeqLenK)
+        {
+            if(num_total_loop <= 0)
+            {
+                if constexpr(kStoreLSE)
+                {
+                    auto lse =
+                        make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+                    set_tile(lse, -numeric<SMPLComputeDataType>::infinity());
+
+                    store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse));
+                }
+
+                // Note: here occ are all cleard, return it
+                // Note: q loaded but no fence, ignore it.
+                return o_acc;
+            }
+        }
+
+        const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
+        auto bias_dram_window =
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
+                             bias_dram_block_window_tmp.get_window_lengths(),
+                             {bias_origin.at(number<0>{}), seqlen_k_start}, // M/N
+                             Policy::template MakeBiasDramTileDistribution<decltype(gemm_0)>());
+
+        auto randval_dram_window = dropout.template MakeRandvalDramWindow<decltype(gemm_0)>(
+            randval_dram_block_window_tmp, seqlen_k_start);
+
+        q_tile = tile_elementwise_in(q_element_func, q_tile);
+
+        index_t i_total_loops = 0;
+
+        do
+        {
+            if constexpr(kPreloadWholeNextIterationK)
+            {
+                if(i_total_loops == 0) // executed by fist iteration
+                {
+                    if(num_total_loop > 1) // there are multiple iterations
+                    {
+                        static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
+                            store_tile(
+                                k_lds_windows[number<i_k0 % NumKLdsBuffers>{}],
+                                tile_elementwise_in(k_element_func, k_tiles[number<i_k0>{}]));
+
+                            k_tiles[number<i_k0 + 1>{}] = load_tile(k_dram_window);
+                            if constexpr(i_k0 < k0_loops - 2)
+                                move_tile_window(k_dram_window, {0, kK0});
+
+                            if constexpr(i_k0 == 0)
+                                clear_tile(s_acc);
+
+                            block_sync_lds();
+                            // execute current unroll of gemm_0
+                            gemm_0(s_acc,
+                                   get_slice_tile(q_tile,
+                                                  sequence<0, i_k0 * kK0>{},
+                                                  sequence<kM0, (i_k0 + 1) * kK0>{}),
+                                   k_lds_windows[number<i_k0 % NumKLdsBuffers>{}]);
+                        });
+
+                        store_tile(
+                            k_lds_windows[number<(k0_loops - 1) % NumKLdsBuffers>{}],
+                            tile_elementwise_in(k_element_func, k_tiles[number<k0_loops - 1>{}]));
+
+                        // prefetch first v_tile
+                        v_tiles[I0] = load_tile(v_dram_window);
+                        move_tile_window(v_dram_window, {0, kK1});
+
+                        move_tile_window(k_dram_window, {kN0, -(k0_loops - 1) * kK0});
+
+                        // prefetch all k_tiles for next iteration
+                        static_for<0, k0_loops, 1>{}([&](auto i_k0) {
+                            k_tiles[number<i_k0>{}] = load_tile(k_dram_window);
+
+                            if constexpr(i_k0 < k0_loops - 1)
+                                move_tile_window(k_dram_window, {0, kK0});
+                        });
+
+                        move_tile_window(k_dram_window, {0, -(k0_loops - 1) * kK0});
+
+                        block_sync_lds();
+                        // execute last unroll of gemm_0
+                        gemm_0(s_acc,
+                               get_slice_tile(q_tile,
+                                              sequence<0, (k0_loops - 1) * kK0>{},
+                                              sequence<kM0, k0_loops * kK0>{}),
+                               k_lds_windows[number<(k0_loops - 1) % NumKLdsBuffers>{}]);
+                    }
+                    else // there is only single iteration
+                    {
+                        static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
+                            store_tile(
+                                k_lds_windows[number<i_k0 % NumKLdsBuffers>{}],
+                                tile_elementwise_in(k_element_func, k_tiles[number<i_k0>{}]));
+
+                            k_tiles[number<i_k0 + 1>{}] = load_tile(k_dram_window);
+                            if constexpr(i_k0 < k0_loops - 2)
+                                move_tile_window(k_dram_window, {0, kK0});
+
+                            if constexpr(i_k0 == 0)
+                                clear_tile(s_acc);
+
+                            block_sync_lds();
+                            // execute current unroll of gemm_0
+                            gemm_0(s_acc,
+                                   get_slice_tile(q_tile,
+                                                  sequence<0, i_k0 * kK0>{},
+                                                  sequence<kM0, (i_k0 + 1) * kK0>{}),
+                                   k_lds_windows[number<i_k0 % NumKLdsBuffers>{}]);
+                        });
+
+                        store_tile(
+                            k_lds_windows[number<(k0_loops - 1) % NumKLdsBuffers>{}],
+                            tile_elementwise_in(k_element_func, k_tiles[number<k0_loops - 1>{}]));
+
+                        // prefetch first v_tile
+                        v_tiles[I0] = load_tile(v_dram_window);
+                        move_tile_window(v_dram_window, {0, kK1});
+
+                        block_sync_lds();
+                        gemm_0(s_acc,
+                               get_slice_tile(q_tile,
+                                              sequence<0, (k0_loops - 1) * kK0>{},
+                                              sequence<kM0, k0_loops * kK0>{}),
+                               k_lds_windows[number<(k0_loops - 1) % NumKLdsBuffers>{}]);
+
+                        // move_tile_window(k_dram_window, {0, -k0_loops * kK0});
+                    }
+                }
+                else // executed by intermediate and last iteration
+                {
+                    if(i_total_loops < num_total_loop - 1) // intermediate iteration
+                    {
+                        store_tile(k_lds_windows[I0],
+                                   tile_elementwise_in(k_element_func, k_tiles[I0]));
+
+                        // prefetch first v_tile
+                        v_tiles[I0] = load_tile(v_dram_window);
+                        move_tile_window(v_dram_window, {0, kK1});
+
+                        clear_tile(s_acc);
+                        block_sync_lds();
+                        gemm_0(s_acc,
+                               get_slice_tile(q_tile, sequence<0, 0>{}, sequence<kM0, kK0>{}),
+                               k_lds_windows[I0]);
+
+                        store_tile(k_lds_windows[I1],
+                                   tile_elementwise_in(k_element_func, k_tiles[I1]));
+
+                        move_tile_window(k_dram_window, {kN0, 0});
+
+                        // prefetch first k_tile for next iteration
+                        k_tiles[I0] = load_tile(k_dram_window);
+                        move_tile_window(k_dram_window, {0, kK0});
+
+                        k_tiles[I1] = load_tile(k_dram_window);
+                        if constexpr(1 < k0_loops - 1)
+                            move_tile_window(k_dram_window, {0, kK0});
+
+                        block_sync_lds();
+                        gemm_0(s_acc,
+                               get_slice_tile(q_tile, sequence<0, kK0>{}, sequence<kM0, 2 * kK0>{}),
+                               k_lds_windows[I1]);
+
+                        // during the gemm-loop, also prefetch other k_tiles for next iteration
+                        static_for<2, k0_loops, 1>{}([&](auto i_k0) {
+                            store_tile(k_lds_windows[number<i_k0 % NumKLdsBuffers>{}],
+                                       k_tiles[number<i_k0>{}]);
+
+                            k_tiles[number<i_k0>{}] = load_tile(k_dram_window);
+                            if constexpr(i_k0 < k0_loops - 1)
+                                move_tile_window(k_dram_window, {0, kK0});
+
+                            block_sync_lds();
+                            gemm_0(s_acc,
+                                   get_slice_tile(q_tile,
+                                                  sequence<0, i_k0 * kK0>{},
+                                                  sequence<kM0, (i_k0 + 1) * kK0>{}),
+                                   k_lds_windows[number<i_k0 % NumKLdsBuffers>{}]);
+                        });
+
+                        move_tile_window(k_dram_window, {0, -(k0_loops - 1) * kK0});
+                    }
+                    else // last iteration
+                    {
+                        store_tile(k_lds_windows[I0],
+                                   tile_elementwise_in(k_element_func, k_tiles[I0]));
+
+                        // prefetch first v_tile
+                        v_tiles[I0] = load_tile(v_dram_window);
+                        move_tile_window(v_dram_window, {0, kK1});
+
+                        clear_tile(s_acc);
+                        block_sync_lds();
+                        gemm_0(s_acc,
+                               get_slice_tile(q_tile, sequence<0, 0>{}, sequence<kM0, kK0>{}),
+                               k_lds_windows[I0]);
+
+                        static_for<1, k0_loops, 1>{}([&](auto i_k0) {
+                            store_tile(
+                                k_lds_windows[number<i_k0 % NumKLdsBuffers>{}],
+                                tile_elementwise_in(k_element_func, k_tiles[number<i_k0>{}]));
+
+                            block_sync_lds();
+                            gemm_0(s_acc,
+                                   get_slice_tile(q_tile,
+                                                  sequence<0, i_k0 * kK0>{},
+                                                  sequence<kM0, (i_k0 + 1) * kK0>{}),
+                                   k_lds_windows[number<i_k0 % NumKLdsBuffers>{}]);
+                        });
+                    };
+                };
+            }
+            else // only preload one unroll of K for next iteration
+            {
+                static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
+                    store_tile(k_lds_windows[number<i_k0 % NumKLdsBuffers>{}],
+                               tile_elementwise_in(k_element_func, k_tiles[I0]));
+                    if constexpr(i_k0 == 0)
+                        clear_tile(s_acc);
+
+                    if constexpr(i_k0 < k0_loops - 1)
+                        k_tiles[I0] = load_tile(k_dram_window);
+                    if constexpr(i_k0 < k0_loops - 2)
+                        move_tile_window(k_dram_window, {0, kK0});
+
+                    block_sync_lds();
+                    // execute current unroll of gemm_0
+                    gemm_0(s_acc,
+                           get_slice_tile(q_tile,
+                                          sequence<0, i_k0 * kK0>{},
+                                          sequence<kM0, (i_k0 + 1) * kK0>{}),
+                           k_lds_windows[number<i_k0 % NumKLdsBuffers>{}]);
+                });
+
+                store_tile(k_lds_windows[number<(k0_loops - 1) % NumKLdsBuffers>{}],
+                           tile_elementwise_in(k_element_func, k_tiles[I0]));
+
+                // prefetch first v_tile
+                v_tiles[I0] = load_tile(v_dram_window);
+                move_tile_window(v_dram_window, {0, kK1});
+
+                block_sync_lds();
+                gemm_0(s_acc,
+                       get_slice_tile(q_tile,
+                                      sequence<0, (k0_loops - 1) * kK0>{},
+                                      sequence<kM0, k0_loops * kK0>{}),
+                       k_lds_windows[number<(k0_loops - 1) % NumKLdsBuffers>{}]);
+            };
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            const auto bias_tile = load_tile(bias_dram_window); // load bias tile
+
+            static_for<1, NumPrefetchV, 1>{}([&](auto i_buf) {
+                v_tiles[i_buf] = load_tile(v_dram_window);
+                move_tile_window(v_dram_window, {0, kK1});
+            });
+
+            // STAGE 2, scale_s, add bias, mask, softmax
+            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+            {
+                s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
+                tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc);
+                tile_elementwise_inout(
+                    [&](auto& x, const auto& y) {
+#if !CK_TILE_FMHA_FWD_FAST_EXP2
+                        x += type_convert<SaccDataType>(bias_element_func(y));
+#else
+                        x += log2e_v<SaccDataType> *
+                             type_convert<SaccDataType>(bias_element_func(y));
+#endif
+                    },
+                    s_acc,
+                    bias_tile);
+            }
+            else if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI)
+            {
+                const auto k_origin    = k_dram_block_window.get_window_origin();
+                constexpr auto s_spans = decltype(s_acc)::get_distributed_spans();
+                s_acc                  = tile_elementwise_in(s_acc_element_func, s_acc);
+                sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) {
+                    sweep_tile_span(s_spans[number<1>{}], [&](auto idx1) {
+                        const auto tile_idx = get_x_indices_from_distributed_indices(
+                            s_acc.get_tile_distribution(), make_tuple(idx0, idx1));
+
+                        const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                        const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                        constexpr auto i_j_idx = make_tuple(idx0, idx1);
+
+                        s_acc(i_j_idx) *= scale_s;
+                        position_encoding.update(s_acc(i_j_idx), row, col);
+                    });
+                });
+            }
+            else
+            {
+                s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
+#if !CK_TILE_FMHA_FWD_FAST_EXP2
+                tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc);
+#endif
+            }
+            move_tile_window(bias_dram_window, {0, kN0});
+            if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
+            {
+                const auto k_origin      = k_dram_block_window.get_window_origin();
+                bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}),
+                                                           k_origin.at(number<0>{}),
+                                                           number<kM0>{},
+                                                           number<kN0>{});
+                if(need_perpixel_check)
+                {
+                    set_tile_if(
+                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                            const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{});
+                            const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{});
+                            return mask.IsOutOfBound(row, col);
+                        });
+                }
+            }
+
+            const auto s = cast_tile<SMPLComputeDataType>(s_acc); // S{j}
+            auto m_local = block_tile_reduce<SMPLComputeDataType>(
+                s,
+                sequence<1>{},
+                f_max,
+                -numeric<SMPLComputeDataType>::infinity()); // m_local = rowmax(S{j})
+            block_tile_reduce_sync(m_local, f_max, bool_constant<false>{});
+
+            const auto m_old = m; // m{j-1}
+            tile_elementwise_inout(
+                [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local); // m{j}
+
+            auto p_compute = make_static_distributed_tensor<SMPLComputeDataType>(
+                s.get_tile_distribution()); // Pcompute{j}
+
+            static const auto get_validated_m = [](SMPLComputeDataType raw_m) {
+                /// NOTICE: bias might be materialized mask including -inf values, need
+                /// consideration
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return raw_m == -numeric<SMPLComputeDataType>::infinity()
+                               ? type_convert<SMPLComputeDataType>(0.f)
+                               : raw_m;
+                }
+                else
+                {
+                    return raw_m;
+                }
+            };
+
+            constexpr auto p_spans = decltype(p_compute)::get_distributed_spans();
+            sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                auto row_max = scale_s * get_validated_m(m[i_idx]);
+#endif
+                sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max);
+                    }
+#else
+                    p_compute(i_j_idx)     = exp(s[i_j_idx] - get_validated_m(m[i_idx]));
+#endif
+                });
+            });
+
+            auto rowsum_p = block_tile_reduce<SMPLComputeDataType>(
+                p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j})
+
+            block_tile_reduce_sync(rowsum_p, f_sum, bool_constant<false>{});
+            // l{j}, Oacc{j}
+            constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+            sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                const auto tmp = [&]() {
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        return exp2(m_old[i_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        auto row_max = scale_s * get_validated_m(m[i_idx]);
+                        return exp2(scale_s * m_old[i_idx] - row_max);
+                    }
+                }();
+#else
+                    const auto tmp = exp(m_old[i_idx] - get_validated_m(m[i_idx]));
+#endif
+                l(i_idx) = tmp * l[i_idx] + rowsum_p[i_idx];
+                sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                    // FIXME: this use different equation from FA v2 paper,
+                    // but produce correc result.
+                    // Is the equation wrong?
+                    o_acc(i_j_idx) *= tmp;
+                });
+            });
+
+            if constexpr(kHasDropout)
+            {
+                auto randval_ptr =
+                    reinterpret_cast<char*>(smem_ptr) + Policy::template GetSmemSizeK<Problem>();
+                dropout.template Run<decltype(gemm_0), SMPLComputeDataType, RandValOutputDataType>(
+                    smem_ptr, seqlen_k_start + i_total_loops * kN0, p_compute, randval_dram_window);
+            }
+
+            __builtin_amdgcn_sched_barrier(0x7f);
+
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
+                    Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
+                shuffle_tile(v_shuffle_tmp, v_tiles[I0]);
+
+                store_tile(
+                    v_lds_windows[I0],
+                    tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch
+            }
+            else
+            {
+                store_tile(v_lds_windows[I0],
+                           tile_elementwise_in(v_element_func, v_tiles[I0])); // store the prefetch
+            }
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            const auto p =
+                cast_tile<PDataType>(tile_elementwise_in(p_compute_element_func, p_compute));
+
+            if constexpr(!kPreloadWholeNextIterationK)
+            {
+                if(i_total_loops < num_total_loop - 1)
+                {
+                    move_tile_window(k_dram_window, {kN0, -(k0_loops - 1) * kK0});
+                    k_tiles[I0] = load_tile(k_dram_window);
+                    move_tile_window(k_dram_window, {0, kK0});
+                };
+
+                __builtin_amdgcn_sched_barrier(0);
+            }
+
+            // STAGE 3, KV gemm
+            if constexpr(k1_loops > 1)
+            {
+                if constexpr(NumPrefetchV == 1) // NumVLdsBuffers == 2
+                {
+                    static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
+                        v_tiles[I0] = load_tile(v_dram_window);
+
+                        block_sync_lds();
+                        gemm_1(o_acc,
+                               get_slice_tile(
+                                   p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
+                               v_lds_windows[number<i_k1 % NumVLdsBuffers>{}]);
+
+                        if constexpr(std::is_same_v<VLayout,
+                                                    ck_tile::tensor_layout::gemm::RowMajor>)
+                        {
+                            auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
+                                Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
+                            shuffle_tile(v_shuffle_tmp, v_tiles[I0]);
+                            store_tile(v_lds_windows[number<(i_k1 + 1) % NumVLdsBuffers>{}],
+                                       tile_elementwise_in(v_element_func, v_shuffle_tmp));
+                        }
+                        else
+                        {
+                            store_tile(v_lds_windows[number<(i_k1 + 1) % NumVLdsBuffers>{}],
+                                       tile_elementwise_in(v_element_func, v_tiles[I0]));
+                        }
+
+                        move_tile_window(v_dram_window, {0, kK1});
+                    });
+                }
+                else // NumVLdsBuffers == 3 or 2
+                {
+                    static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
+                        if constexpr(i_k1 < k1_loops - NumPrefetchV)
+                            v_tiles[number<i_k1 % NumPrefetchV>{}] = load_tile(v_dram_window);
+
+                        block_sync_lds();
+                        gemm_1(o_acc,
+                               get_slice_tile(
+                                   p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
+                               v_lds_windows[number<i_k1 % NumVLdsBuffers>{}]);
+
+                        if constexpr(std::is_same_v<VLayout,
+                                                    ck_tile::tensor_layout::gemm::RowMajor>)
+                        {
+                            auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
+                                Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
+                            shuffle_tile(v_shuffle_tmp,
+                                         v_tiles[number<(i_k1 + 1) % NumPrefetchV>{}]);
+                            store_tile(v_lds_windows[number<(i_k1 + 1) % NumVLdsBuffers>{}],
+                                       tile_elementwise_in(v_element_func, v_shuffle_tmp));
+                        }
+                        else
+                        {
+                            store_tile(
+                                v_lds_windows[number<(i_k1 + 1) % NumVLdsBuffers>{}],
+                                tile_elementwise_in(v_element_func,
+                                                    v_tiles[number<(i_k1 + 1) % NumPrefetchV>{}]));
+                        }
+
+                        if constexpr(i_k1 < k1_loops - NumPrefetchV)
+                            move_tile_window(v_dram_window, {0, kK1});
+                    });
+                }
+            }
+            // move K tile windows
+            move_tile_window(k_dram_block_window, {kN0, 0});
+
+            block_sync_lds();
+            gemm_1(o_acc,
+                   get_slice_tile(p, sequence<0, (k1_loops - 1) * kK1>{}, sequence<kM0, kN0>{}),
+                   v_lds_windows[number<(k1_loops - 1) % NumVLdsBuffers>{}]);
+
+            if constexpr(Policy::template IsFirstKLdsBufferOverlapLastVLdsBuffer<Problem>())
+            {
+                __builtin_amdgcn_sched_barrier(0);
+                __builtin_amdgcn_s_barrier();
+            };
+
+        } while(++i_total_loops < num_total_loop);
+
+        // store lse
+        if constexpr(kStoreLSE)
+        {
+            auto lse = make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+            constexpr auto lse_spans = decltype(lse)::get_distributed_spans();
+            sweep_tile_span(lse_spans[number<0>{}], [&, m_ = m, l_ = l](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                {
+                    lse(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]);
+                }
+                else
+                {
+                    lse(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]);
+                }
+#else
+                lse(i_idx) = m_[i_idx] + log(l_[i_idx]);
+#endif
+            });
+
+            store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse));
+        }
+
+        // finally, O
+        constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+
+        sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) {
+            constexpr auto i_idx = make_tuple(idx0);
+            const auto tmp       = [&]() {
+                if constexpr(FmhaMask::IsMasking)
+                {
+                    return l[i_idx] == 0.f ? 0.f : 1 / l[i_idx];
+                }
+                else
+                    return 1 / l[i_idx];
+            }();
+            sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) {
+                constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                o_acc(i_j_idx) *= tmp;
+            });
+        });
+
+        o_acc = tile_elementwise_in(o_acc_element_func, o_acc);
+
+        return o_acc;
+    }
+
+    template <typename QDramBlockWindowTmp,
+              typename KDramBlockWindowTmp,
+              typename VDramBlockWindowTmp,
+              typename BiasDramBlockWindowTmp,
+              typename RandValDramBlockWindowTmp,
+              typename LSEDramBlockWindowTmp,
+              typename PositionEncoding>
+    CK_TILE_HOST_DEVICE auto
+    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp,       // M0*K0 tile
+               const KDramBlockWindowTmp& k_dram_block_window_tmp,       // N0*K0 tile
+               const VDramBlockWindowTmp& v_dram_block_window_tmp,       // N1*K1 tile
+               const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile
+               RandValDramBlockWindowTmp& randval_dram_block_window_tmp, // M0*N0 tile
+               LSEDramBlockWindowTmp& lse_dram_block_window_tmp,         // M0*1 tile
+               FmhaMask mask,
+               PositionEncoding position_encoding,
+               float scale_s,
+               void* smem_ptr,
+               DropoutType& dropout) const
+    {
+        return operator()(q_dram_block_window_tmp,
+                          identity{},
+                          k_dram_block_window_tmp,
+                          identity{},
+                          v_dram_block_window_tmp,
+                          identity{},
+                          bias_dram_block_window_tmp,
+                          identity{},
+                          randval_dram_block_window_tmp,
+                          lse_dram_block_window_tmp,
+                          identity{},
+                          identity{},
+                          identity{},
+                          identity{},
+                          mask,
+                          position_encoding,
+                          scale_s,
+                          smem_ptr,
+                          dropout);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp
new file mode 100644
index 0000000000..67ab548dab
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_default_policy.hpp
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
+
+namespace ck_tile {
+
+struct BlockFmhaPipelineQRKSVSWholeKPrefetchDefaultPolicy
+    : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
+                                          /* AsyncCopy = */ false,
+                                          /* NumPrefetchK = */ -1,
+                                          /* NumPrefetchV = */ 2>
+{
+    static constexpr index_t NumPrefetchV = 2;
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t IsPreloadWholeNextIterationK()
+    {
+        return Problem::BlockFmhaShape::kM0 <= 64;
+    };
+
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto GetNumKLdsBuffers()
+    {
+        return 2;
+    }
+
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto GetNumPrefetchV()
+    {
+        using BlockFmhaShape = remove_cvref_t<typename Problem::BlockFmhaShape>;
+
+        constexpr index_t kN0 = BlockFmhaShape::kN0;
+        constexpr index_t kK1 = BlockFmhaShape::kK1;
+
+        constexpr index_t k1_loops = kN0 / kK1;
+
+        return min(NumPrefetchV, k1_loops);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetNumVLdsBuffers()
+    {
+        return 2;
+    };
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQRegTileDistribution()
+    {
+        using BlockGemm = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
+
+        return BlockGemm::template MakeABlockTileDistribution<
+            Problem::BlockFmhaShape::kM0,
+            Problem::BlockFmhaShape::kQKHeaddim>();
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPackK()
+    {
+        using KDataType = remove_cvref_t<typename Problem::KDataType>;
+        return 8 / sizeof(KDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeKLdsBlockDescriptor()
+    {
+        constexpr index_t NumKLdsBuffers = GetNumKLdsBuffers<Problem>();
+        constexpr index_t kNPerBlock     = Problem::BlockFmhaShape::kN0;
+        constexpr index_t kKPerBlock     = Problem::BlockFmhaShape::kK0;
+        constexpr index_t kKPack         = GetSmemKPackK<Problem>();
+        constexpr index_t kKVector       = GetAlignmentK<Problem>();
+
+        static_assert(kKVector % kKPack == 0);
+
+        constexpr auto k_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<NumKLdsBuffers>{},
+                       number<kKPerBlock / kKVector>{},
+                       number<kKVector / kKPack>{},
+                       number<kNPerBlock>{},
+                       number<kKPack>{}),
+            make_tuple(number<kKPerBlock * kNPerBlock + kKPerBlock * kKPack / kKVector>{},
+                       number<kNPerBlock * kKVector + kKPack>{},
+                       number<kNPerBlock * kKPack>{},
+                       number<kKPack>{},
+                       number<1>{}),
+            number<kKPack>{},
+            number<1>{});
+
+        constexpr auto k_lds_block_desc = transform_tensor_descriptor(
+            k_lds_block_desc_0,
+            make_tuple(
+                make_merge_transform(make_tuple(number<NumKLdsBuffers>{}, number<kNPerBlock>{})),
+                make_merge_transform(make_tuple(number<kKPerBlock / kKVector>{},
+                                                number<kKVector / kKPack>{},
+                                                number<kKPack>{}))),
+            make_tuple(sequence<0, 3>{}, sequence<1, 2, 4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return k_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeKDramTileDistribution()
+    {
+        using KDataType = remove_cvref_t<typename Problem::KDataType>;
+
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0;
+
+        constexpr index_t MaxVectorSize = 16 / sizeof(KDataType);
+
+        constexpr index_t ElemPerThread = (kNPerBlock * kKPerBlock) / kBlockSize;
+        static_assert(0 < ElemPerThread);
+        constexpr index_t kMaxVecLoad = min(ElemPerThread, MaxVectorSize);
+
+        constexpr index_t KPerThread     = kMaxVecLoad;
+        constexpr index_t KThreads       = kKPerBlock / KPerThread;
+        constexpr index_t NThreadPerWarp = get_warp_size() / KThreads;
+        constexpr index_t NumWarps       = kBlockSize / get_warp_size();
+        constexpr index_t NPerThread     = kNPerBlock / (NThreadPerWarp * NumWarps);
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<NPerThread, NThreadPerWarp, NumWarps>,
+                                             sequence<KThreads, KPerThread>>,
+                                       tuple<sequence<1>, sequence<1, 2>>,
+                                       tuple<sequence<2>, sequence<1, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 1>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeVLdsBlockDescriptor()
+    {
+        using VDataType = remove_cvref_t<typename Problem::VDataType>;
+
+        constexpr index_t NumVLdsBuffers = GetNumVLdsBuffers<Problem>();
+
+        constexpr index_t Banks        = 32; // TODO: need change based on arch
+        constexpr index_t PixelsPerRow = Banks * 4 / sizeof(VDataType);
+        constexpr index_t kKPack       = GetSmemKPackV<Problem>();
+        static_assert(PixelsPerRow % kKPack == 0);
+        constexpr index_t NPerRow    = PixelsPerRow / kKPack;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
+        static_assert(kNPerBlock % NPerRow == 0);
+        static_assert(kKPerBlock % kKPack == 0);
+
+        constexpr index_t VSingleSmemElementSpaceSize =
+            (kKPerBlock / kKPack) * (kNPerBlock / NPerRow) * (PixelsPerRow + kKPack);
+
+        constexpr auto v_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<NumVLdsBuffers>{},
+                       number<kKPerBlock / kKPack>{},
+                       number<kNPerBlock / NPerRow>{},
+                       number<NPerRow>{},
+                       number<kKPack>{}),
+            make_tuple(number<VSingleSmemElementSpaceSize>{},
+                       number<(kNPerBlock / NPerRow) * (PixelsPerRow + kKPack)>{},
+                       number<PixelsPerRow + kKPack>{},
+                       number<kKPack>{},
+                       number<1>{}),
+            number<kKPack>{},
+            number<1>{});
+
+        constexpr auto v_lds_block_desc = transform_tensor_descriptor(
+            v_lds_block_desc_0,
+            make_tuple(
+                make_merge_transform(make_tuple(
+                    number<NumVLdsBuffers>{}, number<kNPerBlock / NPerRow>{}, number<NPerRow>{})),
+                make_merge_transform(make_tuple(number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
+            make_tuple(sequence<0, 2, 3>{}, sequence<1, 4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return v_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeVDramTileDistribution()
+    {
+        using VLayout = remove_cvref_t<typename Problem::BlockFmhaShape::VLayout>;
+
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
+
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            constexpr index_t N1 = GetAlignmentV<Problem>();
+            constexpr index_t N0 = kNPerBlock / N1; // P
+
+            constexpr index_t ElemPerThread = kNPerBlock * kKPerBlock / kBlockSize;
+            static_assert(ElemPerThread % N1 == 0);
+            constexpr index_t K3     = ElemPerThread / N1;
+            constexpr index_t kKPack = GetSmemKPackV<Problem>();
+            static_assert(kKPack % K3 == 0);
+            constexpr index_t K2 = kKPack / K3;
+            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * N0);
+                constexpr index_t K0 = kBlockSize / get_warp_size();
+                static_assert(kKPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * N0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
+                static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+            constexpr index_t K1 = GetAlignmentV<Problem>();
+            constexpr index_t K0 = kKPerBlock / K1;
+            constexpr index_t N2 = get_warp_size() / K0;
+            constexpr index_t N1 = kBlockSize / get_warp_size();
+            static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+            static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
+            constexpr index_t N0 = kNPerBlock / (N2 * N1);
+            static_assert(N0 != 0);
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                           tuple<sequence<1>, sequence<1, 2>>,
+                                           tuple<sequence<1>, sequence<2, 0>>,
+                                           sequence<1, 2>,
+                                           sequence<0, 1>>{});
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetQKBlockGemm()
+    {
+        using GemmProblem =
+            BlockGemmProblem<typename Problem::QDataType,
+                             typename Problem::KDataType,
+                             typename Problem::SaccDataType,
+                             Problem::kNumGemm0Warps * get_warp_size(),
+                             TileGemmShape<sequence<Problem::BlockFmhaShape::kM0,
+                                                    Problem::BlockFmhaShape::kN0,
+                                                    Problem::BlockFmhaShape::kK0>,
+                                           typename Problem::BlockFmhaShape::Gemm0BlockWarps,
+                                           typename Problem::BlockFmhaShape::Gemm0WarpTile>>;
+
+        constexpr auto warp_gemm = []() {
+            constexpr index_t WarpGemmM = Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{});
+            static_assert(WarpGemmM == 4 || WarpGemmM == 16 || WarpGemmM == 32);
+
+            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                         std::is_same_v<typename Problem::KDataType, half_t> &&
+                         std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                if constexpr(WarpGemmM == 32)
+                    return WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution{};
+                else if constexpr(WarpGemmM == 16)
+                    return WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution{};
+                else // WarpGemmM == 4
+                    return WarpGemmMfmaF16F16F32M4N64K16{};
+            }
+            else if constexpr(std::is_same_v<typename Problem::QDataType, bf16_t> &&
+                              std::is_same_v<typename Problem::KDataType, bf16_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                if constexpr(WarpGemmM == 32)
+                    return WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution{};
+                else if constexpr(WarpGemmM == 16)
+                    return WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution{};
+                else // WarpGemmM == 4
+                    return WarpGemmMfmaBf16Bf16F32M4N64K16{};
+            }
+            else if constexpr(std::is_same_v<typename Problem::QDataType, fp8_t> &&
+                              std::is_same_v<typename Problem::KDataType, fp8_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
+            {
+                static_assert(WarpGemmM == 32);
+
+                // TODO: hard coded here. Otherwise, it may incorrect result
+                constexpr index_t swizzle_factor = 4;
+                return WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution<
+                    swizzle_factor>{};
+            } // TODO - bf8_t
+        }();
+
+        using BlockGemmPolicy =
+            BlockGemmARegBSmemCRegV2CustomPolicy<typename Problem::QDataType,
+                                                 typename Problem::KDataType,
+                                                 typename Problem::SaccDataType,
+                                                 typename Problem::BlockFmhaShape::Gemm0BlockWarps,
+                                                 decltype(warp_gemm)>;
+
+        if constexpr(1 < Problem::kNumGemm0Warps)
+            return BlockGemmARegBSmemCRegV2<GemmProblem, BlockGemmPolicy>{};
+        else
+            return BlockGemmARegBSmemCRegOneWarpV1<GemmProblem, BlockGemmPolicy>{};
+    }
+
+    // leave some exclusive space so that the second v_lds buffer will nenver overlap with the first
+    // k_lds bufffer
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetExclusiveKLdsBytes()
+    {
+        constexpr index_t single_k_lds_buffer_size =
+            GetSmemSizeK<Problem>() / GetNumKLdsBuffers<Problem>();
+        constexpr index_t single_v_lds_buffer_size =
+            GetSmemSizeV<Problem>() / GetNumVLdsBuffers<Problem>();
+
+        if constexpr(single_k_lds_buffer_size <= single_v_lds_buffer_size)
+            return 0;
+        else
+            return integer_least_multiple(single_k_lds_buffer_size - single_v_lds_buffer_size, 64);
+    };
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t IsFirstKLdsBufferOverlapLastVLdsBuffer()
+    {
+        using BlockFmhaShape = remove_cvref_t<typename Problem::BlockFmhaShape>;
+
+        constexpr index_t k1_loops          = BlockFmhaShape::kN0 / BlockFmhaShape::kK1;
+        constexpr index_t num_k_lds_buffers = GetNumKLdsBuffers<Problem>();
+        constexpr index_t num_v_lds_buffers = GetNumVLdsBuffers<Problem>();
+
+        constexpr index_t last_v_lds_buffer_offset =
+            MakeVLdsBlockDescriptor<Problem>().get_element_space_size() / num_v_lds_buffers *
+            ((k1_loops - 1) % num_v_lds_buffers) * sizeof(typename Problem::VDataType);
+
+        constexpr index_t first_k_lds_buffer_size =
+            MakeKLdsBlockDescriptor<Problem>().get_element_space_size() / num_k_lds_buffers *
+            sizeof(typename Problem::KDataType);
+
+        return GetExclusiveKLdsBytes<Problem>() + last_v_lds_buffer_offset <
+               first_k_lds_buffer_size;
+    };
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeK()
+    {
+        return MakeKLdsBlockDescriptor<Problem>().get_element_space_size() *
+               sizeof(typename Problem::KDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeV()
+    {
+        return MakeVLdsBlockDescriptor<Problem>().get_element_space_size() *
+               sizeof(typename Problem::VDataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        // assume V can reuse the other shared memory by K except the first
+        // assume Dropout can reuse the shared memory by V
+        return GetExclusiveKLdsBytes<Problem>() +
+               max(GetSmemSizeK<Problem>() - GetExclusiveKLdsBytes<Problem>(),
+                   max(GetSmemSizeV<Problem>(), GetSmemSizeDropout<Problem>(0)));
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
index c2223fcee6..7be6a347f5 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
@@ -94,6 +94,8 @@ struct BlockFmhaPipelineQSKSVS
             {
                 return 1;
             }
+            else
+                return 1;
         }
     }();
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp
index ff8299b4ff..7505dbb172 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp
@@ -11,8 +11,7 @@ namespace ck_tile {
 // This pipeline is qkv all located in LDS
 struct BlockFmhaPipelineQSKSVSDefaultPolicy
     : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ false,
-                                          /* AsyncCopyK = */ false,
-                                          /* AsyncCopyV = */ false,
+                                          /* AsyncCopy = */ false,
                                           /* NumPrefetchK = */ 1,
                                           /* NumPrefetchV = */ 1>
 {
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index 3db461e971..26f7e46f9f 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -17,9 +17,6 @@
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
 
-// TODO: remove this
-#define K_LDS_LOAD_USE_OFFSET_TRANSFORM 0
-
 namespace ck_tile {
 
 template <bool QLoadOnce_>
@@ -50,9 +47,11 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
         return min(MaxVectorSize, WG::kK / WG::WarpGemmAttribute::Impl::kABKLane);
     }
 
-    template <typename Problem, typename BlockGemm>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeQDramTileDistribution()
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQRegTileDistribution()
     {
+        using BlockGemm = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
+
         return BlockGemm::template MakeABlockTileDistribution<
             Problem::BlockFmhaShape::kM0,
             Problem::BlockFmhaShape::kSubQKHeaddim>();
@@ -278,37 +277,43 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
 };
 
 // This pipeline is qkv all located in LDS
-template <bool QLoadOnce_,
-          bool AsyncCopyK_,
-          bool AsyncCopyV_,
-          index_t NumPrefetchK_,
-          index_t NumPrefetchV_>
+template <bool QLoadOnce_, bool AsyncCopy_, index_t NumPrefetchK_, index_t NumPrefetchV_>
 struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLoadOnce_>
 {
-    static constexpr bool AsyncCopyK = AsyncCopyK_;
-    static constexpr bool AsyncCopyV = AsyncCopyV_; // TODO: this not supported yet
+    static constexpr bool AsyncCopy = AsyncCopy_;
 
     static constexpr index_t NumPrefetchK = NumPrefetchK_;
     static constexpr index_t NumPrefetchV = NumPrefetchK_;
 
+    static constexpr index_t NumKVLdsBuffers = max(NumPrefetchK, NumPrefetchV);
+
     using QXPolicy = BlockFmhaPipelineQXCustomPolicy<QLoadOnce_>;
 
     template <index_t k_prefetches_, index_t v_prefetches_, index_t k_loops_, index_t v_loops_>
     struct LdsBufferSequence
     {
+        static constexpr index_t num_lds_buffers_ = max(k_prefetches_, v_prefetches_);
+        static constexpr index_t ceil_ = ((v_loops_ - 1) / num_lds_buffers_) * num_lds_buffers_;
+
+        // for qr_ks_vs_async, the Lds buffer assigned to last gemm_1 iteration of V should not
+        // overlap with the Lds buffers used by first two gemm_0 iterations of K
         static constexpr auto Make()
         {
+            // ensure v_loop_-1 is assigned to num_lds_buffers-1
             return transform_sequences(
                 [&](auto i) {
                     if(i < k_loops_)
-                        return i % k_prefetches_;
-                    return (i - k_loops_) % v_prefetches_;
+                        return i % num_lds_buffers_;
+                    else
+                        return ((num_lds_buffers_ - 1) + (i - k_loops_ + ceil_ - (v_loops_ - 1))) %
+                               num_lds_buffers_;
                 },
                 typename arithmetic_sequence_gen<0, k_loops_ + v_loops_, 1>::type{});
         };
 
         using type = remove_cvref_t<decltype(Make())>;
     };
+
     // clang-format off
     template<> struct
     LdsBufferSequence<3, 3, 4, 4> { using type = sequence<1, 2, 0, 1,   0, 1, 2, 0>; };
@@ -357,13 +362,20 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentK()
     {
         using KDataType = remove_cvref_t<typename Problem::KDataType>;
-        if constexpr(AsyncCopyK)
+        if constexpr(AsyncCopy)
         {
             return 4 / sizeof(KDataType);
         }
         else
         {
-            return 16 / sizeof(KDataType);
+            constexpr index_t kBlockSize = Problem::kBlockSize;
+            constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
+            constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0;
+
+            constexpr index_t MaxVectorSize = 16 / sizeof(KDataType);
+            constexpr index_t ElemPerThread = (kNPerBlock * kKPerBlock) / kBlockSize;
+
+            return min(MaxVectorSize, ElemPerThread);
         }
     }
 
@@ -427,7 +439,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     {
         // this function assume K/V can share smem
         constexpr index_t SingleKSize = [&]() {
-            if constexpr(!AsyncCopyK)
+            if constexpr(!AsyncCopy)
             {
                 return MakeKLdsBlockDescriptor<Problem>().get_element_space_size();
             }
@@ -549,55 +561,6 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         return k_lds_block_desc_issues_warps_lanes;
     }
 
-#if K_LDS_LOAD_USE_OFFSET_TRANSFORM
-    template <typename Problem, index_t IBuf = 0>
-    CK_TILE_HOST_DEVICE static constexpr auto
-        MakeKLdsLoadBlockDescriptor(number<IBuf> = number<0>{})
-    {
-        // K is always k-major, we use async-copy to load into LDS
-        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-        constexpr index_t NumWarps   = Problem::BlockFmhaShape::NumWarps;
-        constexpr index_t warpSize   = ck_tile::get_warp_size();
-
-        constexpr index_t KPack   = GetSmemKPackK<Problem>(); // this is for lds
-        constexpr index_t KVector = GetAlignmentK<Problem>(); // this is for global load
-        constexpr index_t kPad    = KPack; // for async-copy, this pad is between warps
-
-        static_assert(warpSize * KVector >= kKPerBlock && warpSize * KVector % kKPerBlock == 0);
-        constexpr index_t LanesPerK  = kKPerBlock / KVector; // within a wave
-        constexpr index_t LaneGroups = warpSize / LanesPerK; // within a wave
-        constexpr index_t NumIssues  = kNPerBlock / (LaneGroups * NumWarps);
-        static_assert(NumIssues == kNPerBlock * kKPerBlock / (kBlockSize * KVector));
-
-        constexpr auto k_lds_block_desc_0 = make_naive_tensor_descriptor_with_offset(
-            make_tuple(number<NumIssues>{},          // n0
-                       number<NumWarps>{},           // n2
-                       number<LaneGroups>{},         // n1
-                       number<kKPerBlock / KPack>{}, // k0
-                       number<KPack>{}),             // k1
-            make_tuple(number<NumWarps*(warpSize * KVector + kPad)>{},
-                       number<warpSize * KVector + kPad>{},
-                       number<kKPerBlock>{},
-                       number<KPack>{},
-                       number<1>{}),
-            number<IBuf * GetSingleSmemElementSpaceSize<Problem>()>{},
-            number<KPack>{},
-            number<1>{});
-
-        constexpr auto k_lds_block_desc = transform_tensor_descriptor(
-            k_lds_block_desc_0,
-            make_tuple(
-                make_merge_transform(
-                    make_tuple(number<NumIssues>{}, number<LaneGroups>{}, number<NumWarps>{})),
-                make_merge_transform(make_tuple(number<kKPerBlock / KPack>{}, number<KPack>{}))),
-            make_tuple(sequence<0, 2, 1>{}, sequence<3, 4>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return k_lds_block_desc;
-    }
-#else
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeKLdsLoadBlockDescriptor()
     {
@@ -624,7 +587,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             GetSingleSmemElementSpaceSize<Problem>(); //  max(SingleKSize, SingleVSize);
 
         constexpr auto k_lds_block_desc_0 =
-            make_naive_tensor_descriptor(make_tuple(number<NumPrefetchK>{},       // num_buffers
+            make_naive_tensor_descriptor(make_tuple(number<NumKVLdsBuffers>{},    // num_buffers
                                                     number<NumIssues>{},          // n0
                                                     number<NumWarps>{},           // n2
                                                     number<LaneGroups>{},         // n1
@@ -642,7 +605,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr auto k_lds_block_desc = transform_tensor_descriptor(
             k_lds_block_desc_0,
             make_tuple(
-                make_merge_transform(make_tuple(number<NumPrefetchK>{},
+                make_merge_transform(make_tuple(number<NumKVLdsBuffers>{},
                                                 number<NumIssues>{},
                                                 number<LaneGroups>{},
                                                 number<NumWarps>{})),
@@ -652,7 +615,6 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
 
         return k_lds_block_desc;
     }
-#endif
 
     // 3d + padding
     template <typename Problem>
@@ -670,7 +632,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         static_assert(kKPerBlock % kKPack == 0);
 
         constexpr auto v_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<NumPrefetchV>{},
+            make_tuple(number<NumKVLdsBuffers>{},
                        number<kKPerBlock / kKPack>{},
                        number<kNPerBlock / NPerRow>{},
                        number<NPerRow>{},
@@ -687,7 +649,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             v_lds_block_desc_0,
             make_tuple(
                 make_merge_transform(make_tuple(
-                    number<NumPrefetchV>{}, number<kNPerBlock / NPerRow>{}, number<NPerRow>{})),
+                    number<NumKVLdsBuffers>{}, number<kNPerBlock / NPerRow>{}, number<NPerRow>{})),
                 make_merge_transform(make_tuple(number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
             make_tuple(sequence<0, 2, 3>{}, sequence<1, 4>{}),
             make_tuple(sequence<0>{}, sequence<1>{}));
@@ -703,14 +665,13 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr index_t single_smem_size =
             GetSingleSmemElementSpaceSize<Problem>() * sizeof(typename Problem::KDataType);
 
-        return QXPolicy::template GetSmemSizeQ<Problem>() +
-               single_smem_size * max(NumPrefetchK, NumPrefetchV);
+        return QXPolicy::template GetSmemSizeQ<Problem>() + single_smem_size * NumKVLdsBuffers;
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
     {
-        if constexpr(AsyncCopyK)
+        if constexpr(AsyncCopy)
         {
             return GetSmemSizeKV<Problem>() + GetSmemSizeDropout<Problem>(0);
         }
@@ -754,7 +715,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeKDramTileDistribution()
     {
-        if constexpr(!AsyncCopyK)
+        if constexpr(!AsyncCopy)
         {
             using KDataType = remove_cvref_t<typename Problem::KDataType>;
 
@@ -762,7 +723,10 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
             constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0;
 
-            constexpr index_t K1 = 16 / sizeof(KDataType);
+            constexpr index_t MaxVectorSize = 16 / sizeof(KDataType);
+            constexpr index_t ElemPerThread = (kNPerBlock * kKPerBlock) / kBlockSize;
+
+            constexpr index_t K1 = min(MaxVectorSize, ElemPerThread);
             constexpr index_t K0 = kKPerBlock / K1;
             constexpr index_t N2 = get_warp_size() / K0;
             constexpr index_t N1 = kBlockSize / get_warp_size();
diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
index 5ce80c2d1f..76ba34115f 100644
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -13,6 +13,8 @@ static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length(index
         return 128;
     if(len == 160)
         return 256;
+    if(len == 192)
+        return 192;
 
     // only length of 96, 160 and power-of-two is supported
     if(!(len & (len - 1)))
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index 340f6cb9e5..6a7ccd2472 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -101,7 +101,7 @@ namespace ck_tile {
 //   max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
 
 
-CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int num_experts_)
+CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int tokens_, int num_experts_)
 {
     /*               num_experts + 1
     *   +--------------------------------------+
@@ -132,7 +132,7 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int nu
 
         // round to sub_unroll multipl
         int r_for_sub_token = r - cumsum_bufs;
-        r_for_sub_token = min(r_for_sub_token, num_tokens_);
+        r_for_sub_token = min(r_for_sub_token, tokens_);
         r_for_sub_token = (r_for_sub_token + sub_unroll - 1) / sub_unroll * sub_unroll;
         r_for_sub_token = max(r_for_sub_token, 1);
 
@@ -148,7 +148,6 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int nu
             
             mask_ = mask_ > 0b111 ? 0b111 : mask_;  //clamp to 8x at most
             mask_ = ~mask_;
-            //printf("r_unroll_:%d, clz:%d, mask:%x\n", r_unroll_, clz_, mask_); fflush(stdout);
 
             r_for_sub_token = (r_unroll_ & mask_) * sub_unroll;
         }
@@ -161,11 +160,17 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int nu
         return r_for_sub_token + cumsum_bufs;
     }();
 
-    // printf("r:%d, c:%d\n", smem_rows, smem_cols);
-
     return ck_tile::make_tuple(smem_rows, smem_cols);
 }
 
+CK_TILE_HOST index_t moe_sorting_get_sub_token(int tokens_, int num_experts_)
+{
+    auto [r_, c_]            = moe_sorting_get_smem_row_col(tokens_, num_experts_);
+    auto sub_token_          = r_ - 2;
+    (void) c_;
+    return sub_token_;
+}
+
 struct MoeSortingHostArgs
 {
     const void* p_topk_ids;     // [token, topk]
@@ -180,11 +185,14 @@ struct MoeSortingHostArgs
     // we fused the setzero of output of fused-moe buffer
     // set this pointer to nullptr will skip this operation
     void* p_moe_buf;
+    void* p_ws;             // size is moe_sorting_get_workspace_size()
+                            // if return zero, then could be nullptr
+                            // must be cleard before use
     index_t tokens;
     index_t unit_size;      // this is the M_a of fused-moe kernel
     index_t num_experts;
     index_t topk;
-    index_t moe_buf_bytes;  // byte size of p_moe_buf
+    long_index_t moe_buf_bytes;  // byte size of p_moe_buf
 };
 
 template <typename Problem_>
@@ -211,7 +219,7 @@ struct MoeSortingKernel
         void* p_moe_buf;
         index_t tokens;
         index_t num_experts;
-        index_t moe_buf_bytes;
+        long_index_t moe_buf_bytes;
 
         index_t tokens_per_thread;
         index_t smem_rows;
@@ -418,7 +426,7 @@ struct MoeSortingKernel
         return row * total_col + col;
     }
 
-    CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes) const
+    CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, long_index_t buf_bytes) const
     {
         const index_t offset = (blockIdx.x - 1) * blockDim.x + threadIdx.x;
         if(offset < buf_bytes / 16)
@@ -1046,6 +1054,818 @@ struct MoeSortingKernel
     }
 };
 
+namespace impl {
+
+// [expert, padded_tokens]
+CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_stride(index_t tokens)
+{
+    constexpr index_t chunk = 32;
+    return (tokens + chunk - 1) / chunk * chunk;
+};
+
+CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_elem(index_t tokens, index_t num_experts)
+{
+    index_t row_size = moe_sorting_mp_mesh_stride(tokens);
+    return num_experts * row_size;
+};
+
+CK_TILE_HOST_DEVICE index_t moe_sorting_mp_cumsum_elem(index_t num_experts)
+{
+    constexpr index_t chunk = 32;
+    index_t row_size        = num_experts + 1;
+    return (row_size + chunk - 1) / chunk * chunk;
+};
+
+template <typename T, typename F, index_t wave_size_ = warpSize>
+CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
+{
+    // constexpr int wave_size = 64;
+    // constexpr int reduce_stage = 6; // 1<<6=64
+    // clang-format off
+    constexpr int reduce_stage = [](){
+        if constexpr(wave_size_ == 2) return 1;
+        else if constexpr(wave_size_ == 4) return 2;
+        else if constexpr(wave_size_ == 8) return 3;
+        else if constexpr(wave_size_ == 16) return 4;
+        else if constexpr(wave_size_ == 32) return 5;
+        else if constexpr(wave_size_ == 64) return 6;
+        else return 0;
+    }();
+    // clang-format on
+    T v_local = local;
+#pragma unroll reduce_stage
+    for(int i_stage = 0; i_stage < reduce_stage; i_stage++)
+    {
+        int src_lane = __lane_id() ^ (1 << i_stage);
+        int32_t v_remote_tmp =
+            __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
+        T v_remote = bit_cast<T>(v_remote_tmp);
+        v_local    = reduce_f(v_local, v_remote);
+    }
+    return v_local;
+}
+
+// [a, b, c, d....] -> [a, a+b, a+b+c, a+b+c+d, ....]
+// NOTE: wave_size need at least be 16!! dpp 16 is one row
+template <typename data_t, int wave_size>
+CK_TILE_DEVICE void moe_sorting_wave_cumsum(data_t& thread_data)
+{
+    // wave_size must be power of 2
+    constexpr int row_mask    = 0xf;
+    constexpr int bank_mask   = 0xf;
+    constexpr bool bound_ctrl = true; // ! out-of-bound is zero !
+    auto reduce_op            = [&](auto x_, auto y_) { return x_ + y_; };
+
+    if constexpr(wave_size > 1)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x111,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:1
+    }
+
+    if constexpr(wave_size > 2)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x112,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:2
+    }
+    if constexpr(wave_size > 4)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x114,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:4
+    }
+    if constexpr(wave_size == 8)
+    {
+
+        // wave-size=8 need one extra shift
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x118,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:8
+#if 0
+        constexpr int bank_mask_0_7 = 0b1100;
+        auto reduce_op_r = [&](auto x_, auto y_) { return x_ - y_; };
+        thread_data = reduce_op_r(thread_data, __builtin_bit_cast(data_t,
+                                                __builtin_amdgcn_update_dpp(0, /* old value */
+                                                    __builtin_bit_cast(int, thread_data),
+                                                    0x157,
+                                                    row_mask,
+                                                    bank_mask_0_7,
+                                                    bound_ctrl))// row_newbcast:7
+                                                    );
+#else
+        data_t xxx =
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x157,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl)); // row_newbcast:7
+
+        data_t yyy  = (__lane_id() / 8) % 2 == 0 ? 0 : xxx;
+        thread_data = thread_data - yyy;
+#endif
+    }
+    if constexpr(wave_size > 8)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x118,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:8
+    }
+
+    if constexpr(wave_size > 16)
+    {
+        // now row-0, row-0+row-1, row-1+row-2, row-2+row-3
+        int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 1) << 2,
+                                                        __builtin_bit_cast(int, thread_data));
+        v_remote_tmp     = __lane_id() >= 16 ? v_remote_tmp : 0;
+        thread_data      = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp));
+    }
+
+    if constexpr(wave_size > 32)
+    {
+        // lane-id 48...63->31
+        int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 17) << 2,
+                                                        __builtin_bit_cast(int, thread_data));
+        v_remote_tmp     = __lane_id() >= 32 ? v_remote_tmp : 0;
+        thread_data      = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp));
+    }
+}
+
+template <index_t BLOCK_SIZE = 256>
+CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, long_index_t buf_bytes, index_t gid)
+{
+    // const index_t offset = (blockIdx.x - 1) * BLOCK_SIZE + threadIdx.x;
+    long_index_t offset = static_cast<long_index_t>(gid) * BLOCK_SIZE + threadIdx.x;
+    if(offset < buf_bytes / 16)
+    {
+        buf[offset] = uint8x16_t{0};
+    }
+}
+
+} // namespace impl
+
+// prefer to run mp kernel if is not oneshot
+CK_TILE_HOST bool moe_sorting_is_oneshot(int tokens_, int num_experts_)
+{
+#if CK_TILE_WA_ISSUE_2028
+    if(tokens_ >= 65536 * 2)
+    {
+        return true;
+    }
+#endif
+    auto sub_token_          = moe_sorting_get_sub_token(tokens_, num_experts_);
+    bool is_sub_token_onshot = tokens_ <= sub_token_;
+    return is_sub_token_onshot;
+}
+
+// return size in byte
+CK_TILE_HOST index_t moe_sorting_mp_get_workspace_size(int tokens_, int num_experts_)
+{
+    index_t elem = impl::moe_sorting_mp_mesh_elem(tokens_, num_experts_) +
+                   impl::moe_sorting_mp_cumsum_elem(num_experts_);
+    return elem * sizeof(index_t);
+}
+
+// return size in byte
+CK_TILE_HOST index_t moe_sorting_get_workspace_size(int tokens_, int num_experts_)
+{
+#if 1
+    if(moe_sorting_is_oneshot(tokens_, num_experts_))
+    {
+        return 0;
+    }
+    else
+    {
+        return moe_sorting_mp_get_workspace_size(tokens_, num_experts_);
+    }
+#else
+    return moe_sorting_mp_get_workspace_size(tokens_, num_experts_);
+#endif
+}
+
+// below kernel is multi-phase implementation for large token and/or expert case
+
+// write into a buffer to record the token cnt
+// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
+//           number)
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+/*
+
+p_expert_mesh:
+     t0 t1 t2 t3 t4 r5
+    +--+--+--+--+--+--+
+e0  | 1|  |  |  |  |  |
+e1  |  |  | 1| 1| 1|  |
+e2  |  | 1|  | 1|  |  |
+e3  | 1| 1| 1| 1| 1|  |
+e4  |  |  |  |  |  |  |
+e5  | 1| 1| 1|  |  | 1|
+
+
+p_expert_cumsum:
+    | 1| 3| 2| 5| 0| 4|
+     e0 e1 e2 e3 e4 e5
+
+p_expert_cumsum(with M_a pad, and skip zero tokens):
+    | 4| 4| 4| 8| 0| 4|
+     e0 e1 e2 e3 e4 e5
+
+p_expert_cumsum
+    | 0| 4| 8|12|20|20|24|
+
+local_expert_mask : [1, 0, 1, 1, 0, 1] (mask out expert-id=1, 4)
+
+p_m_cumsum
+    | 0| 1| 1| 2| 3| 3| 4|
+
+*/
+
+// count topk_id into mesh
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P0
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+
+    struct Kargs
+    {
+        const void* p_topk_ids; // [tokens, topk]
+        void* p_expert_mesh;    // [expert, tokens]
+        index_t tokens;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+        mdiv topk_mdiv;
+    };
+
+    CK_TILE_HOST static constexpr auto get_num_cu()
+    {
+        index_t num_cu = [&]() {
+            hipDeviceProp_t dev_prop;
+            hipDevice_t dev;
+            HIP_CHECK_ERROR(hipGetDevice(&dev));
+            HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
+            return dev_prop.multiProcessorCount;
+        }();
+        return num_cu;
+    }
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_topk_ids    = h.p_topk_ids;
+        k.p_expert_mesh = h.p_ws;
+        k.tokens        = h.tokens;
+        k.mesh_stride   = impl::moe_sorting_mp_mesh_stride(h.tokens);
+        k.topk_mdiv     = mdiv{static_cast<uint32_t>(h.topk)};
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs&) { return get_num_cu() * OCCUPANCY; }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST static constexpr auto GetSmemSize() { return 0; }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        using topk_id_t = ext_vector_t<IndexType, Problem::SubTokenTile>;
+
+        static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 ||
+                      Problem::SubTokenTile == 4);
+
+        const topk_id_t* p_topk_ids = reinterpret_cast<const topk_id_t*>(kargs.p_topk_ids);
+        IndexType* p_expert_mesh    = reinterpret_cast<IndexType*>(kargs.p_expert_mesh);
+        index_t total_elem = kargs.tokens * kargs.topk_mdiv.divisor / Problem::SubTokenTile;
+
+#pragma unroll Problem::SubTokenTile
+        for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem; i += blockDim.x)
+        {
+            auto x = p_topk_ids[i];
+            static_for<0, Problem::SubTokenTile, 1>{}([&](auto j) {
+                IndexType eid = x[j.value]; // ext_vector_type must use int to []
+                uint32_t curr_token_id, curr_topk_id;
+                kargs.topk_mdiv.divmod(i * Problem::SubTokenTile + j, curr_token_id, curr_topk_id);
+                p_expert_mesh[eid * kargs.mesh_stride + curr_token_id] = curr_topk_id + 1;
+            });
+        }
+    }
+};
+
+// cnt total tokens for a expert
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P1
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+    struct Kargs
+    {
+        const void* p_local_expert_mask; // [expert]
+        void* p_expert_mesh;             // [expert, tokens]
+        void* p_expert_cumsum;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+    };
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_local_expert_mask = h.p_local_expert_mask;
+        k.p_expert_mesh       = h.p_ws;
+        k.p_expert_cumsum =
+            reinterpret_cast<void*>(reinterpret_cast<IndexType*>(h.p_ws) +
+                                    impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts));
+        k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens);
+
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) { return dim3(h.num_experts); }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
+    {
+        return BLOCK_SIZE / warpSize * sizeof(IndexType);
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        __shared__ char smem[GetSmemSize()];
+
+        int eid = blockIdx.x;
+
+        constexpr index_t index_pack = 4;                                   // always packed
+        using r_t                    = ext_vector_t<IndexType, index_pack>; // always use int32x4
+        r_t* p_expert_mesh           = reinterpret_cast<r_t*>(
+            reinterpret_cast<index_t*>(kargs.p_expert_mesh) + eid * kargs.mesh_stride);
+
+        static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 ||
+                      Problem::SubTokenTile == 4);
+        const IndexType* p_local_expert_mask =
+            static_cast<const IndexType*>(kargs.p_local_expert_mask);
+        IndexType* p_expert_cumsum = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
+
+        auto f_sum = [](auto x_, auto y_) { return x_ + y_; };
+
+        int loops = (kargs.mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+        if constexpr(Problem::LocalExpertMasking)
+        {
+            IndexType mask = p_local_expert_mask[eid];
+            if(mask == 0)
+                return; // skip
+        }
+
+        index_t cnt = 0; // per-wave cnt
+        for(int i = 0; i < loops; i++)
+        {
+            int position = i * BLOCK_SIZE + threadIdx.x;
+            r_t v{0};
+            if(position < (kargs.mesh_stride / index_pack))
+                v = p_expert_mesh[position];
+            index_t local_sum = 0;
+            static_for<0, index_pack, 1>{}(
+                [&](auto i_vec) { local_sum += v[i_vec.value] != 0 ? 1 : 0; });
+            cnt += impl::moe_sorting_wave_reduce(local_sum, f_sum);
+        }
+
+        index_t lane_id = threadIdx.x % warpSize;
+        index_t wave_id = threadIdx.x / warpSize;
+
+        // reduce cross wave
+        IndexType* s = reinterpret_cast<IndexType*>(smem);
+        if(lane_id == 0)
+        {
+            s[wave_id] = cnt;
+        }
+        __syncthreads();
+
+        if(threadIdx.x == 0)
+        {
+            index_t c = 0;
+            for(auto i = 0; i < (BLOCK_SIZE / warpSize); i++)
+            {
+                c += s[i];
+            }
+            p_expert_cumsum[eid] = c;
+        }
+    }
+};
+
+// token count cumsum
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P2
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+    struct Kargs
+    {
+        const void* p_local_expert_mask; // [expert]
+        void* p_expert_mesh;             // [expert, tokens]
+        void* p_expert_cumsum;           // [expert + 1]
+        void* p_total_tokens_post_pad;   // [1]
+        void* p_sorted_expert_ids;
+        void* p_moe_buf;
+        index_t tokens;
+        index_t num_experts;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+        mdiv unit_size_mdiv;
+        long_index_t moe_buf_bytes;
+    };
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_local_expert_mask = h.p_local_expert_mask;
+        // k.p_expert_mesh                    = h.p_ws;
+        k.p_expert_cumsum =
+            reinterpret_cast<void*>(reinterpret_cast<IndexType*>(h.p_ws) +
+                                    impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts));
+        k.p_total_tokens_post_pad = h.p_total_tokens_post_pad;
+        k.p_sorted_expert_ids     = h.p_sorted_expert_ids;
+
+        k.p_moe_buf = h.p_moe_buf;
+
+        k.tokens         = h.tokens;
+        k.num_experts    = h.num_experts;
+        k.mesh_stride    = impl::moe_sorting_mp_mesh_stride(h.tokens);
+        k.unit_size_mdiv = mdiv{static_cast<uint32_t>(h.unit_size)};
+
+        k.moe_buf_bytes = h.moe_buf_bytes;
+
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
+    {
+        // use 1 block to cumsum
+        return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16));
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
+    {
+        return 2 * BLOCK_SIZE * sizeof(IndexType);
+    }
+
+    // reduce single pixel within a wave
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        if(blockIdx.x > 0)
+        {
+            impl::moe_buf_set_zero_kernel<BLOCK_SIZE>(
+                reinterpret_cast<uint8x16_t*>(kargs.p_moe_buf),
+                kargs.moe_buf_bytes,
+                blockIdx.x - 1);
+            return;
+        }
+        __shared__ char smem[GetSmemSize()];
+        IndexType* s = reinterpret_cast<IndexType*>(smem);
+
+        const IndexType* p_local_expert_mask =
+            static_cast<const IndexType*>(kargs.p_local_expert_mask);
+        IndexType* p_expert_cumsum = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
+        IndexType* p_total_tokens_post_pad =
+            reinterpret_cast<IndexType*>(kargs.p_total_tokens_post_pad);
+        IndexType* p_sorted_expert_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_expert_ids);
+
+        const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        index_t wave_id     = threadIdx.x / warpSize;
+        index_t lane_id     = threadIdx.x % warpSize;
+
+        IndexType prev_cumsum_a = 0;
+        IndexType prev_cumsum_b = 0;
+
+        for(index_t i = 0; i < loops; i++)
+        {
+            index_t position = i * BLOCK_SIZE + threadIdx.x;
+            IndexType a_     = 0; // token count for a expert
+            IndexType b_     = 0; // mask for a expert
+            if(position < kargs.num_experts)
+            {
+                a_ = p_expert_cumsum[position];
+                if constexpr(Problem::LocalExpertMasking)
+                    b_ = p_local_expert_mask[position];
+            }
+
+            int blocks_pers_expert =
+                kargs.unit_size_mdiv.div(a_ + kargs.unit_size_mdiv.divisor - 1);
+            // pad token
+            int padded_blocks_per_expert = [&]() {
+                int x_ = [&]() {
+                    if constexpr(Problem::SkipExpertsWithZeroTokens)
+                    {
+                        // if local_cnt is zero, blocks_pers_expert will be zero
+                        // this is what we want to achieve
+                        return blocks_pers_expert; //  * kargs.unit_size_mdiv.divisor;
+                    }
+                    else
+                    {
+                        return max(blocks_pers_expert, 1);
+                    }
+                }();
+                if constexpr(Problem::LocalExpertMasking)
+                {
+                    return b_ ? x_ : 0;
+                }
+                else
+                    return x_;
+            }();
+
+            IndexType cumsum_a = padded_blocks_per_expert;
+            IndexType cumsum_b = b_;
+
+            // Note: we first cumsum local round, then add previous cumsum
+            impl::moe_sorting_wave_cumsum<IndexType, warpSize>(cumsum_a);
+            impl::moe_sorting_wave_cumsum<IndexType, warpSize>(cumsum_b);
+
+            __syncthreads();
+            if(lane_id == warpSize - 1)
+            {
+                s[4 + wave_id]                         = cumsum_a;
+                s[4 + wave_id + BLOCK_SIZE / warpSize] = cumsum_b;
+            }
+
+            __syncthreads();
+
+            // reduce cross wave
+            static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) {
+                IndexType prev_a = s[4 + i_w];
+                IndexType prev_b = s[4 + i_w + BLOCK_SIZE / warpSize];
+                prev_a           = wave_id > i_w ? prev_a : 0; // mask out
+                prev_b           = wave_id > i_w ? prev_b : 0; // mask out
+                cumsum_a += prev_a;
+                cumsum_b += prev_b;
+            });
+
+            // Now let's add previous cumsum
+            cumsum_a += prev_cumsum_a;
+            cumsum_b += prev_cumsum_b;
+
+            if(threadIdx.x == BLOCK_SIZE - 1)
+            {
+                s[2] = cumsum_a; // store the last cumsum
+                s[3] = cumsum_b;
+            }
+
+            IndexType out_0 = cumsum_a - padded_blocks_per_expert; // exclusive cumsum tok cnt
+            IndexType out_1 = cumsum_b - b_;                       // exclusive cumsum mask cnt
+
+            __syncthreads();
+            prev_cumsum_a = s[2];
+            prev_cumsum_b = s[3];
+
+            if(position < kargs.num_experts)
+            {
+                p_expert_cumsum[position] = out_0 * kargs.unit_size_mdiv.divisor;
+            }
+
+            {
+                if constexpr(Problem::LocalExpertMasking)
+                {
+                    if(b_)
+                    {
+                        for(int j = 0; j < blocks_pers_expert; j++)
+                        {
+                            p_sorted_expert_ids[out_0 + j] = out_1;
+                        }
+                    }
+                }
+                else
+                {
+                    for(int j = 0; j < blocks_pers_expert; j++)
+                    {
+                        p_sorted_expert_ids[out_0 + j] = position;
+                    }
+                }
+            }
+        }
+
+        if(threadIdx.x == 0)
+        {
+            auto total_tokens_post_pad         = prev_cumsum_a * kargs.unit_size_mdiv.divisor;
+            p_total_tokens_post_pad[0]         = total_tokens_post_pad;
+            p_expert_cumsum[kargs.num_experts] = total_tokens_post_pad;
+        }
+    }
+};
+
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P3
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+
+    struct Kargs
+    {
+        const void* p_weights;
+        const void* p_local_expert_mask;
+        void* p_sorted_token_ids;
+        void* p_sorted_weights;
+        void* p_expert_mesh; // [token, expert]
+        void* p_expert_cumsum;
+
+        index_t tokens;
+        index_t num_experts;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+        mdiv topk_mdiv;
+    };
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_weights           = h.p_weights;
+        k.p_local_expert_mask = h.p_local_expert_mask;
+        k.p_sorted_token_ids  = h.p_sorted_token_ids;
+        k.p_sorted_weights    = h.p_sorted_weights;
+        k.p_expert_mesh       = h.p_ws;
+        k.p_expert_cumsum =
+            reinterpret_cast<void*>(reinterpret_cast<IndexType*>(h.p_ws) +
+                                    impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts));
+        k.tokens      = h.tokens;
+        k.num_experts = h.num_experts;
+        k.topk_mdiv   = mdiv{static_cast<uint32_t>(h.topk)};
+        k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens);
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) { return dim3(h.num_experts); }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
+    {
+        return (4 + BLOCK_SIZE / warpSize) * sizeof(IndexType);
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        __shared__ char smem[GetSmemSize()];
+
+        const IndexType* p_local_expert_mask =
+            static_cast<const IndexType*>(kargs.p_local_expert_mask);
+        IndexType* s                  = reinterpret_cast<IndexType*>(smem);
+        IndexType* p_expert_mesh      = reinterpret_cast<IndexType*>(kargs.p_expert_mesh);
+        IndexType* p_sorted_token_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_token_ids);
+        IndexType* p_expert_cumsum    = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
+        const WeightType* p_weights   = static_cast<const WeightType*>(kargs.p_weights);
+        WeightType* p_sorted_weights  = reinterpret_cast<WeightType*>(kargs.p_sorted_weights);
+
+        static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 ||
+                      Problem::SubTokenTile == 4);
+
+        int eid     = blockIdx.x;
+        int wave_id = threadIdx.x / warpSize;
+        int lane_id = threadIdx.x % warpSize;
+        int e_start = p_expert_cumsum[eid];
+        int e_end   = p_expert_cumsum[eid + 1];
+        if constexpr(Problem::SkipExpertsWithZeroTokens)
+        {
+            if(e_start == e_end)
+                return;
+        }
+
+        if constexpr(Problem::LocalExpertMasking)
+        {
+            int e_mask = p_local_expert_mask[eid];
+            if(e_mask == 0)
+                return; // skip empty expert
+        }
+
+        // cumsum one by one
+        int loops       = (kargs.mesh_stride + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        int prev_cumsum = 0;
+        for(int i = 0; i < loops; i++)
+        {
+            int i_token = i * BLOCK_SIZE + threadIdx.x;
+            IndexType x = 0;
+            if(i_token < kargs.tokens)
+            {
+                x = p_expert_mesh[eid * kargs.mesh_stride + i_token];
+            }
+            int i_topk = x - 1;          // topk of this token
+            int i_show = x != 0 ? 1 : 0; // has this token or not
+            int cumsum = i_show;
+            impl::moe_sorting_wave_cumsum<int, warpSize>(cumsum);
+
+            __syncthreads();
+            if(lane_id == warpSize - 1)
+            {
+                s[4 + wave_id] = cumsum;
+            }
+            __syncthreads();
+
+            // reduce cross wave
+            static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) {
+                IndexType prev = s[4 + i_w];
+                prev           = wave_id > i_w ? prev : 0; // mask out
+                cumsum += prev;
+            });
+            cumsum += prev_cumsum; // add previous round cumsum
+            if(threadIdx.x == BLOCK_SIZE - 1)
+            {
+                s[0] = cumsum;
+            }
+            __syncthreads();
+
+            int position = cumsum - i_show;
+            prev_cumsum  = s[0]; // update the last cumsum
+
+            if(i_show)
+            {
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+                p_sorted_token_ids[e_start + position] = MOE_SORTING_MOCK_ID(i_token, i_topk);
+#else
+                p_sorted_token_ids[e_start + position] = i_token;
+#endif
+                p_sorted_weights[e_start + position] =
+                    p_weights[i_token * kargs.topk_mdiv.divisor + i_topk];
+            }
+        }
+
+        for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += BLOCK_SIZE)
+        {
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+            p_sorted_token_ids[i] = MOE_SORTING_MOCK_ID(kargs.tokens, kargs.topk_mdiv.divisor);
+#else
+            p_sorted_token_ids[i] = tokens;
+#endif
+            p_sorted_weights[i] = static_cast<WeightType>(0.0);
+        }
+    }
+};
+
 #undef MOE_SORTING_MOCK_ID
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp
index 15effe7118..a98e0d7652 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp
@@ -49,4 +49,21 @@ struct MoeSortingProblemEx
     static constexpr index_t ExpertTile = ExpertTile_; // TODO: only used in store out
 };
 
+template <typename IndexType_,
+          typename WeightType_,
+          index_t SubTokenTile_,    // 1,2,4
+          bool LocalExpertMasking_, // used in EP case
+          bool SkipExpertsWithZeroTokens_ = true>
+struct MoeSortingProblemMp
+{
+    // TODO: this kernel only support warp per row
+    using WeightType = remove_cvref_t<WeightType_>;
+    using IndexType  = remove_cvref_t<IndexType_>;
+
+    static constexpr index_t SubTokenTile           = SubTokenTile_;
+    static constexpr bool LocalExpertMasking        = LocalExpertMasking_;
+    static constexpr bool SkipExpertsWithZeroTokens = SkipExpertsWithZeroTokens_;
+    static_assert(SubTokenTile == 1 || SubTokenTile == 2 || SubTokenTile == 4);
+};
+
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
index 6e817fca27..38410721ae 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
@@ -207,6 +207,7 @@ struct FusedMoeGemmPipeline_FlatmmUk
                        threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA;
             },
             number<row_ids_a.size()>{});
+
         auto a_res =
             make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr),
                                       kargs.num_tokens * kargs.stride_token * sizeof(ADataType));
@@ -318,10 +319,10 @@ struct FusedMoeGemmPipeline_FlatmmUk
                                            {0, 0},
                                            dist_);
         }();
+
         auto o_res =
             make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
                                       kargs.num_tokens * kargs.stride_token * sizeof(ODataType));
-
         auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0);
         auto w_scale      = GetWeightScale(
             row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 794f7f21f2..35f5170179 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -44,8 +44,11 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
index d9d6739fb5..c4d527da63 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -68,25 +68,19 @@ struct BlockUniversalGemmAsBsCr
         static constexpr index_t NPerBlockPerIter = NWarp * WarpGemm::kN;
         static constexpr index_t KPerBlockPerIter = WarpGemm::kK;
 
-        using AWarpTileDistr = remove_cvref_t<decltype(make_static_tile_distribution(
-            typename WarpGemm::AWarpDstrEncoding{}))>;
-        using BWarpTileDistr = remove_cvref_t<decltype(make_static_tile_distribution(
-            typename WarpGemm::BWarpDstrEncoding{}))>;
-
-        using AWarpTile = remove_cvref_t<decltype(make_static_distributed_tensor<ComputeDataType>(
-            AWarpTileDistr{}))>;
-        using BWarpTile = remove_cvref_t<decltype(make_static_distributed_tensor<ComputeDataType>(
-            BWarpTileDistr{}))>;
-
-        // TODO: Should we have two policies? Interwave & Intrawave ??
+        // Controls how many MAC clusters (MFMA blocks) we have per wave
+        // Ie if
+        // InterWaveSchedulingMacClusters = 1;
+        // KPerBlock == 32
+        // WarpGemm::kK = 8
+        // Then we would group all 4 WarpGemms into single MAC cluster.
+        // But if we would set InterWaveSchedulingMacClusters = 2, then we would
+        // split those 4 warp gemms into two groups.
         static constexpr index_t InterWaveSchedulingMacClusters = 1;
 
         // should be at least equal to: WarpGemm::Impl::kABKPerLane
-        // and the question is how to assess upper limit or exact value?
-        // TODO: Should we introduce AK1/BK1 parameters ?
-        static constexpr index_t KPack      = 8;
-        static constexpr index_t KPerThread = KIterPerWarp * KPack;
-        static constexpr index_t KRepeat    = KPerThread / KPack;
+        static constexpr index_t KPack      = WarpGemm::kKPerThread;
+        static constexpr index_t KPerThread = KIterPerWarp * WarpGemm::kKPerThread;
     };
 
     public:
@@ -108,6 +102,25 @@ struct BlockUniversalGemmAsBsCr
 
     static constexpr auto Scheduler = Traits::Scheduler;
 
+    using AWarpDstr = typename WarpGemm::AWarpDstr;
+    using BWarpDstr = typename WarpGemm::BWarpDstr;
+    using CWarpDstr = typename WarpGemm::CWarpDstr;
+
+    using AWarpTensor = typename WarpGemm::AWarpTensor;
+    using BWarpTensor = typename WarpGemm::BWarpTensor;
+    using CWarpTensor = typename WarpGemm::CWarpTensor;
+
+    static constexpr auto a_warp_y_lengths =
+        to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+    static constexpr auto b_warp_y_lengths =
+        to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+    static constexpr auto c_warp_y_lengths =
+        to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+    static constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
+    static constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
+    static constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
     static constexpr index_t APackedSize =
         ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
     static constexpr index_t BPackedSize =
@@ -116,18 +129,67 @@ struct BlockUniversalGemmAsBsCr
     using I0 = number<0>;
     using I1 = number<1>;
 
+    CK_TILE_DEVICE static constexpr auto MakeABlockDistributionEncode()
+    {
+        constexpr index_t KPerThread     = Traits::KPerThread;
+        constexpr index_t NumMacClusters = Traits::InterWaveSchedulingMacClusters;
+        constexpr index_t KPerInnerLoop =
+            ck_tile::max(KPerThread / NumMacClusters, WarpGemm::kKPerThread);
+        constexpr index_t KIterInterwave = KPerInnerLoop / WarpGemm::kKPerThread;
+
+        using KIterSeq = std::conditional_t<Scheduler == GemmPipelineScheduler::Interwave,
+                                            sequence<KIterInterwave>,
+                                            sequence<KIterPerWarp>>;
+
+        constexpr auto a_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<MIterPerWarp, MWarp>, KIterSeq>,
+                                       tuple<sequence<1, 0>>,
+                                       tuple<sequence<1, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            a_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
+
+        return a_block_dstr_encode;
+    }
+
+    CK_TILE_DEVICE static constexpr auto MakeBBlockDistributionEncode()
+    {
+        constexpr index_t KPerThread     = Traits::KPerThread;
+        constexpr index_t NumMacClusters = Traits::InterWaveSchedulingMacClusters;
+        constexpr index_t KPerInnerLoop =
+            ck_tile::max(KPerThread / NumMacClusters, WarpGemm::kKPerThread);
+        constexpr index_t KIterInterwave = KPerInnerLoop / WarpGemm::kKPerThread;
+
+        using KIterSeq = std::conditional_t<Scheduler == GemmPipelineScheduler::Interwave,
+                                            sequence<KIterInterwave>,
+                                            sequence<KIterPerWarp>>;
+
+        constexpr auto b_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<MWarp>,
+                                       tuple<sequence<NIterPerWarp, NWarp>, KIterSeq>,
+                                       tuple<sequence<0, 1>>,
+                                       tuple<sequence<0, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+        constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            b_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});
+
+        return b_block_dstr_encode;
+    }
+
     private:
     template <typename WarpWindow, typename WarpTile>
-    CK_TILE_DEVICE static void load_interleaved_pk_type(const WarpWindow& warp_window,
-                                                        WarpTile& warp_tile)
+    CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& warp_tile,
+                                                        const WarpWindow& warp_window)
     {
         constexpr index_t UnaryOpSize = 8;
         const element_wise::PassThroughPack8 elementwise_op{};
-        constexpr index_t thread_buffer_size =
-            Traits::AWarpTile::get_thread_buffer_size() / UnaryOpSize;
-        const auto in_dstr_tensors = load_tile(warp_window);
+        constexpr index_t thread_buffer_size = WarpTile::get_thread_buffer_size() / UnaryOpSize;
+        const auto in_dstr_tensors           = load_tile(warp_window);
 
-        static_assert(Traits::AWarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
+        static_assert(WarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
 
         using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
         static_for<0, thread_buffer_size, 1>{}([&](auto i) {
@@ -144,6 +206,17 @@ struct BlockUniversalGemmAsBsCr
     template <typename GemmTraits>
     struct BlockGemmImpl<GemmPipelineScheduler::Default, GemmTraits>
     {
+        static constexpr auto ALdsTileDistr =
+            decltype(make_static_tile_distribution(MakeABlockDistributionEncode())){};
+        static constexpr auto BLdsTileDistr =
+            decltype(make_static_tile_distribution(MakeBBlockDistributionEncode())){};
+
+        using ALdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(ALdsTileDistr));
+        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));
+
+        ALdsTile a_warp_tile_;
+        ALdsTile b_warp_tile_;
+
         // C += A * B
         template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
         CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
@@ -158,114 +231,39 @@ struct BlockUniversalGemmAsBsCr
                           "The ADataType and BDataType as defined in "
                           "traits should be the same as correspoinding block window data type!");
 
-            static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
-                "MPerBlock, NPerBlock, KPerBlock defined in "
-                " BlockGemmShape are different from A/B block smem windows apropriate dims!");
-
-            const index_t iMWarp = get_warp_id() / NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);
-
-            // TODO: refactor warp_window tile type to class member as it should be
-            // compile-time known information.
-            auto a_warp_window_tmp = make_tile_window(
-                a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
-                a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0},
-                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
-
-            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
-
-            static_assert(GemmTraits::AWarpTile::get_num_of_dimension() ==
-                              AWarpWindow::get_num_of_dimension(),
-                          "AWarpWindow number of dimensions must be equal to "
-                          "AWarpTile number of dimensions!");
-            static_assert(GemmTraits::AWarpTile::get_lengths() ==
-                              AWarpWindow{}.get_window_lengths(),
-                          "AWarpWindow lengths must be equal to AWarpTile lengths!");
-
-            statically_indexed_array<
-                statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
-                MIterPerWarp>
-                a_warp_windows;
-
-            // construct B-warp-window
-            auto b_warp_window_tmp = make_tile_window(
-                b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
-                b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0},
-                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
-
-            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
-
-            static_assert(GemmTraits::BWarpTile::get_num_of_dimension() ==
-                              BWarpWindow::get_num_of_dimension(),
-                          "BWarpWindow number of dimensions must be equal to "
-                          "BWarpTile number of dimensions!");
-            static_assert(GemmTraits::BWarpTile::get_lengths() ==
-                              BWarpWindow{}.get_window_lengths(),
-                          "BWarpWindow lengths must be equal to BWarpTile lengths!");
-
-            statically_indexed_array<
-                statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
-                NIterPerWarp>
-                b_warp_windows;
-
-            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
-
-                    // TODO: I don't have to move 0,0 window!
-                    move_tile_window(a_warp_windows(mIter)(kIter),
-                                     {mIter * GemmTraits::MPerBlockPerIter,
-                                      kIter * GemmTraits::KPerBlockPerIter});
-                });
-            });
-
-            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
-
-                    move_tile_window(b_warp_windows(nIter)(kIter),
-                                     {nIter * GemmTraits::NPerBlockPerIter,
-                                      kIter * GemmTraits::KPerBlockPerIter});
-                });
-            });
-
-            using CWarpDstr   = typename WarpGemm::CWarpDstr;
-            using AWarpTensor = typename WarpGemm::AWarpTensor;
-            using BWarpTensor = typename WarpGemm::BWarpTensor;
-            using CWarpTensor = typename WarpGemm::CWarpTensor;
-
-            constexpr auto c_warp_y_lengths =
-                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
-            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
-
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                load_interleaved_pk_type(a_warp_tile_, a_block_window);
+            }
+            else
+            {
+                load_tile(a_warp_tile_, a_block_window);
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                load_interleaved_pk_type(b_warp_tile_, b_block_window);
+            }
+            else
+            {
+                load_tile(b_warp_tile_, b_block_window);
+            }
             // hot loop:
             static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                 static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    AWarpTensor a_warp_tile;
-                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-                    {
-                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter), a_warp_tile);
-                    }
-                    else
-                    {
-                        a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
-                    }
+                    // read A warp tensor from A block tensor
+                    AWarpTensor a_warp_tensor;
+
+                    a_warp_tensor.get_thread_buffer() = a_warp_tile_.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
 
                     static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                        BWarpTensor b_warp_tile;
-                        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-                        {
-                            load_interleaved_pk_type(b_warp_windows(nIter)(kIter), b_warp_tile);
-                        }
-                        else
-                        {
-                            b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
-                        }
+                        // read B warp tensor from B block tensor
+                        BWarpTensor b_warp_tensor;
+
+                        b_warp_tensor.get_thread_buffer() = b_warp_tile_.get_y_sliced_thread_data(
+                            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
 
                         // read C warp tensor from C block tensor-
                         CWarpTensor c_warp_tensor;
@@ -275,7 +273,7 @@ struct BlockUniversalGemmAsBsCr
                             merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
 
                         // warp GEMM
-                        WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile);
+                        WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
 
                         // write C warp tensor into C block tensor
                         c_block_tensor.set_y_sliced_thread_data(
@@ -291,149 +289,68 @@ struct BlockUniversalGemmAsBsCr
     template <typename GemmTraits>
     struct BlockGemmImpl<GemmPipelineScheduler::Intrawave, GemmTraits>
     {
-        statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::AWarpTile, KIterPerWarp>,
-            MIterPerWarp>
-            a_warp_tiles_;
+        static constexpr auto ALdsTileDistr =
+            decltype(make_static_tile_distribution(MakeABlockDistributionEncode())){};
+        static constexpr auto BLdsTileDistr =
+            decltype(make_static_tile_distribution(MakeBBlockDistributionEncode())){};
 
-        statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::BWarpTile, KIterPerWarp>,
-            NIterPerWarp>
-            b_warp_tiles_;
+        using ALdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(ALdsTileDistr));
+        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));
+
+        ALdsTile a_warp_tile_;
+        BLdsTile b_warp_tile_;
 
         template <typename ASmemBlockWindow, typename BSmemBlockWindow>
         CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
                                           const BSmemBlockWindow& b_block_window)
         {
-            static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
-                "MPerBlock, NPerBlock, KPerBlock defined in "
-                " BlockGemmShape are different from A/B block smem windows apropriate dims!");
-
-            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
-                          "The ADataType and BDataType as defined in "
-                          "traits should be the same as correspoinding block window data type!");
-
-            const index_t iMWarp = get_warp_id() / NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);
-
-            // TODO: refactor warp_window tile type to class member as it should be
-            // compile-time known information.
-            auto a_warp_window_tmp = make_tile_window(
-                a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
-                a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0},
-                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
-
-            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
-
-            static_assert(GemmTraits::AWarpTile::get_num_of_dimension() ==
-                              AWarpWindow::get_num_of_dimension(),
-                          "AWarpWindow number of dimensions must be equal to "
-                          "AWarpTile number of dimensions!");
-            static_assert(GemmTraits::AWarpTile::get_lengths() ==
-                              AWarpWindow{}.get_window_lengths(),
-                          "AWarpWindow lengths must be equal to AWarpTile lengths!");
-
-            statically_indexed_array<statically_indexed_array<AWarpWindow, KIterPerWarp>,
-                                     MIterPerWarp>
-                a_warp_windows;
-
-            // construct B-warp-window
-            auto b_warp_window_tmp = make_tile_window(
-                b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
-                b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0},
-                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
-
-            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
-
-            static_assert(GemmTraits::BWarpTile::get_num_of_dimension() ==
-                              BWarpWindow::get_num_of_dimension(),
-                          "BWarpWindow number of dimensions must be equal to "
-                          "BWarpTile number of dimensions!");
-            static_assert(GemmTraits::BWarpTile::get_lengths() ==
-                              BWarpWindow{}.get_window_lengths(),
-                          "BWarpWindow lengths must be equal to BWarpTile lengths!");
-
-            statically_indexed_array<statically_indexed_array<BWarpWindow, KIterPerWarp>,
-                                     NIterPerWarp>
-                b_warp_windows;
-
-            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
-                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
-
-                    // TODO: I don't have to move 0,0 window!
-                    move_tile_window(a_warp_windows(mIter)(kIter),
-                                     {mIter * GemmTraits::MPerBlockPerIter,
-                                      kIter * GemmTraits::KPerBlockPerIter});
-                });
-            });
-
-            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
-                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
-
-                    move_tile_window(b_warp_windows(nIter)(kIter),
-                                     {nIter * GemmTraits::NPerBlockPerIter,
-                                      kIter * GemmTraits::KPerBlockPerIter});
-                });
-            });
-
-            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    // read A warp tensor from A block window
-                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-                    {
-                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter),
-                                                 a_warp_tiles_(mIter)(kIter));
-                    }
-                    else
-                    {
-                        a_warp_tiles_(mIter)(kIter) = load_tile(a_warp_windows(mIter)(kIter));
-                    }
-                });
-                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                    // read B warp tensor from B Block window
-                    if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-                    {
-                        load_interleaved_pk_type(b_warp_windows(nIter)(kIter),
-                                                 b_warp_tiles_(nIter)(kIter));
-                    }
-                    else
-                    {
-                        b_warp_tiles_(nIter)(kIter) = load_tile(b_warp_windows(nIter)(kIter));
-                    }
-                });
-            });
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                load_interleaved_pk_type(a_warp_tile_, a_block_window);
+            }
+            else
+            {
+                load_tile(a_warp_tile_, a_block_window);
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                load_interleaved_pk_type(b_warp_tile_, b_block_window);
+            }
+            else
+            {
+                load_tile(b_warp_tile_, b_block_window);
+            }
         }
 
         // C += A * B
         template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
         CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
-                                       [[maybe_unused]] const ASmemBlockWindow& a_block_window,
-                                       [[maybe_unused]] const BSmemBlockWindow& b_block_window)
+                                       [[maybe_unused]] ASmemBlockWindow& a_block_window,
+                                       [[maybe_unused]] BSmemBlockWindow& b_block_window)
         {
             static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
                           "The CDataType as defined in traits should be the same as correspoinding "
                           "C block tensor data type!");
 
-            using CWarpDstr   = typename WarpGemm::CWarpDstr;
-            using CWarpTensor = typename WarpGemm::CWarpTensor;
-
-            constexpr auto c_warp_y_lengths =
-                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
-            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
-
             // hot loop:
             static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                 static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                    // read A warp tensor from A block tensor
+                    AWarpTensor a_warp_tensor;
+
+                    a_warp_tensor.get_thread_buffer() = a_warp_tile_.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
+
                     static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                        // read C warp tensor from C block tensor-
+                        // read B warp tensor from B block tensor
+                        BWarpTensor b_warp_tensor;
+
+                        b_warp_tensor.get_thread_buffer() = b_warp_tile_.get_y_sliced_thread_data(
+                            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
+
+                        // read C warp tensor from C block tensor
                         CWarpTensor c_warp_tensor;
 
                         c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
@@ -441,9 +358,7 @@ struct BlockUniversalGemmAsBsCr
                             merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
 
                         // warp GEMM
-                        WarpGemm{}(c_warp_tensor,
-                                   a_warp_tiles_[mIter][kIter],
-                                   b_warp_tiles_[nIter][kIter]);
+                        WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
 
                         // write C warp tensor into C block tensor
                         c_block_tensor.set_y_sliced_thread_data(
@@ -462,132 +377,57 @@ struct BlockUniversalGemmAsBsCr
         static constexpr index_t KPerThread     = GemmTraits::KPerThread;
         static constexpr index_t NumMacClusters = GemmTraits::InterWaveSchedulingMacClusters;
         static constexpr index_t KPerInnerLoop =
-            ck_tile::max(KPerThread / NumMacClusters, GemmTraits::KPack);
-        // TODO: do we really need this?? Are there any cases when this would be >=1 ??
-        // Would we need InterWaveSchedulingMacClusters > 1 ???
+            ck_tile::max(KPerThread / NumMacClusters, WarpGemm::kKPerThread);
         static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
-        static constexpr index_t KInnerLoopIter = KPerInnerLoop / GemmTraits::KPack;
+        static constexpr index_t KInnerLoopIter = KPerInnerLoop / WarpGemm::kKPerThread;
 
-        statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::AWarpTile, KInnerLoopIter>,
-            MIterPerWarp>
-            a_warp_tiles_;
+        static constexpr auto ALdsTileDistr =
+            decltype(make_static_tile_distribution(MakeABlockDistributionEncode())){};
+        static constexpr auto BLdsTileDistr =
+            decltype(make_static_tile_distribution(MakeBBlockDistributionEncode())){};
 
-        statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::BWarpTile, KInnerLoopIter>,
-            NIterPerWarp>
-            b_warp_tiles_;
+        using ALdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(ALdsTileDistr));
+        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));
+
+        ALdsTile a_warp_tile_;
+        ALdsTile b_warp_tile_;
 
         template <index_t KIdx, typename ASmemBlockWindow, typename BSmemBlockWindow>
         CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
                                           const BSmemBlockWindow& b_block_window)
         {
-            static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
-                "MPerBlock, NPerBlock, KPerBlock defined in "
-                " BlockGemmShape are different from A/B block smem windows apropriate dims!");
+            constexpr auto a_lds_load_tile_distr =
+                make_static_tile_distribution(MakeABlockDistributionEncode());
+            constexpr auto b_lds_load_tile_distr =
+                make_static_tile_distribution(MakeBBlockDistributionEncode());
 
-            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
-                          "The ADataType and BDataType as defined in "
-                          "traits should be the same as correspoinding block window data type!");
-
-            const index_t iMWarp = get_warp_id() / NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);
-
-            // TODO: refactor warp_window tile type to class member as it should be
-            // compile-time known information.
-            auto a_warp_window_tmp = make_tile_window(
+            auto a_lds_gemm_window = make_tile_window(
                 a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
-                a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * WarpGemm::kM, KIdx * KPerInnerLoop},
-                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
-
-            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;
-
-            static_assert(GemmTraits::AWarpTile::get_num_of_dimension() ==
-                              AWarpWindow::get_num_of_dimension(),
-                          "AWarpWindow number of dimensions must be equal to "
-                          "AWarpTile number of dimensions!");
-            static_assert(GemmTraits::AWarpTile::get_lengths() ==
-                              AWarpWindow{}.get_window_lengths(),
-                          "AWarpWindow lengths must be equal to AWarpTile lengths!");
-
-            statically_indexed_array<statically_indexed_array<AWarpWindow, KInnerLoopIter>,
-                                     MIterPerWarp>
-                a_warp_windows;
-
-            // construct B-warp-window
-            auto b_warp_window_tmp = make_tile_window(
+                make_tuple(number<GemmTraits::MPerBlock>{}, number<KPerInnerLoop>{}),
+                {0, KIdx * KPerInnerLoop},
+                a_lds_load_tile_distr);
+            auto b_lds_gemm_window = make_tile_window(
                 b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
-                b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * WarpGemm::kN, KIdx * KPerInnerLoop},
-                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
+                make_tuple(number<GemmTraits::NPerBlock>{}, number<KPerInnerLoop>{}),
+                {0, KIdx * KPerInnerLoop},
+                b_lds_load_tile_distr);
 
-            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;
-
-            static_assert(GemmTraits::BWarpTile::get_num_of_dimension() ==
-                              BWarpWindow::get_num_of_dimension(),
-                          "BWarpWindow number of dimensions must be equal to "
-                          "BWarpTile number of dimensions!");
-            static_assert(GemmTraits::BWarpTile::get_lengths() ==
-                              BWarpWindow{}.get_window_lengths(),
-                          "BWarpWindow lengths must be equal to BWarpTile lengths!");
-
-            statically_indexed_array<statically_indexed_array<BWarpWindow, KInnerLoopIter>,
-                                     NIterPerWarp>
-                b_warp_windows;
-
-            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
-                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
-
-                    move_tile_window(a_warp_windows(mIter)(kIter),
-                                     {mIter * GemmTraits::MPerBlockPerIter,
-                                      kIter * GemmTraits::KPerBlockPerIter});
-                });
-            });
-
-            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
-                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
-
-                    move_tile_window(b_warp_windows(nIter)(kIter),
-                                     {nIter * GemmTraits::NPerBlockPerIter,
-                                      kIter * GemmTraits::KPerBlockPerIter});
-                });
-            });
-
-            // TODO check if a_warp_tiles has same desc as a_warp_window
-            static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
-                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
-                    {
-                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter),
-                                                 a_warp_tiles_(mIter)(kIter));
-                    }
-                    else
-                    {
-                        a_warp_tiles_(mIter)(kIter) = load_tile(a_warp_windows(mIter)(kIter));
-                    }
-                });
-                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                    // read B warp tensor from B Block window
-                    if constexpr(std::is_same_v<BDataType, pk_int4_t>)
-                    {
-                        load_interleaved_pk_type(b_warp_windows(nIter)(kIter),
-                                                 b_warp_tiles_(nIter)(kIter));
-                    }
-                    else
-                    {
-                        b_warp_tiles_(nIter)(kIter) = load_tile(b_warp_windows(nIter)(kIter));
-                    }
-                });
-            });
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                load_interleaved_pk_type(a_warp_tile_, a_block_window);
+            }
+            else
+            {
+                load_tile(a_warp_tile_, a_lds_gemm_window);
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                load_interleaved_pk_type(b_warp_tile_, b_block_window);
+            }
+            else
+            {
+                load_tile(b_warp_tile_, b_lds_gemm_window);
+            }
         }
 
         // C += A * B
@@ -600,13 +440,6 @@ struct BlockUniversalGemmAsBsCr
                           "The CDataType as defined in traits should be the same as correspoinding "
                           "C block tensor data type!");
 
-            using CWarpDstr   = typename WarpGemm::CWarpDstr;
-            using CWarpTensor = typename WarpGemm::CWarpTensor;
-
-            constexpr auto c_warp_y_lengths =
-                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
-            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
-
             // hot loop:
             static_for<0, KRepeat, 1>{}([&](auto kIter) {
                 LocalPrefetch<kIter.value>(a_block_window, b_block_window);
@@ -626,7 +459,21 @@ struct BlockUniversalGemmAsBsCr
 
                 static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) {
                     static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                        // read A warp tensor from A block tensor
+                        AWarpTensor a_warp_tensor;
+
+                        a_warp_tensor.get_thread_buffer() = a_warp_tile_.get_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, kInnerIter>{}, a_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
                         static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                            // read B warp tensor from B block tensor
+                            BWarpTensor b_warp_tensor;
+
+                            b_warp_tensor.get_thread_buffer() =
+                                b_warp_tile_.get_y_sliced_thread_data(
+                                    merge_sequences(sequence<nIter, kInnerIter>{},
+                                                    b_warp_y_index_zeros),
+                                    merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
                             // read C warp tensor from C block tensor-
                             CWarpTensor c_warp_tensor;
 
@@ -651,9 +498,7 @@ struct BlockUniversalGemmAsBsCr
                                 __builtin_amdgcn_sched_barrier(0);
                             }
                             // warp GEMM
-                            WarpGemm{}(c_warp_tensor,
-                                       a_warp_tiles_[mIter][kInnerIter],
-                                       b_warp_tiles_[nIter][kInnerIter]);
+                            WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
 
                             // write C warp tensor into C block tensor
                             c_block_tensor.set_y_sliced_thread_data(
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
index 323c682f2c..dfb6bfae58 100644
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -46,7 +46,7 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
 {
     using Base = GemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
 
-    using GemmKernelArgs = typename Base::GemmKernelArgs;
+    using GemmKernelArgs = typename ck_tile::GemmKernelArgs;
 
     using ADataType = typename Base::ADataType;
     using BDataType = typename Base::BDataType;
@@ -65,7 +65,7 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
         using P_ = GemmPipeline;
 
         return concat('_', "gemm_batched", gemm_prec_str<ADataType, BDataType>,
-                      concat('x', P_::kMPerBlock, P_::kNPerBlock, P_::kKPerBlock), 
+                      concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock), 
                       concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()),
                       concat('x', P_::kPadM, P_::kPadN, P_::kPadK));
         // clang-format on
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 3107d07bc9..bc41f680f2 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -9,9 +9,15 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
 #include "ck_tile/host/concat.hpp"
+#include "ck_tile/core/utility/env.hpp"
 
 namespace ck_tile {
 
+/// @brief The GEMM problem definition.
+///
+/// @par Overview
+///      This structure defines the GEMM problem configuration by stating all required information
+///      like M,N,K sizes and respective strides.
 struct GemmProblem
 {
     CK_TILE_HOST GemmProblem() = default;
@@ -29,6 +35,12 @@ struct GemmProblem
     index_t stride_C;
 };
 
+/// @brief The GEMM kernel host arguments.
+///
+/// @par Overview
+///      This structure is passed to @ref GemmKernel "GemmKernel" when creating kernel arguments
+///      object. It contain all necessary information required to build proper kernel argument
+///      and launch kernel on GPU.
 struct GemmHostArgs : public GemmProblem
 {
     CK_TILE_HOST GemmHostArgs() = default;
@@ -56,6 +68,69 @@ struct GemmHostArgs : public GemmProblem
     index_t k_batch;
 };
 
+/// @brief The GEMM kernel device arguments.
+struct GemmKernelArgs
+{
+    /// @brief The A input tensor's pointer to device memory.
+    const void* a_ptr;
+    /// @brief The B input tensor's pointer to device memory.
+    const void* b_ptr;
+    /// @brief The C output tensor's pointer to device memory.
+    void* c_ptr;
+    /// @brief GEMM's M dimension size.
+    index_t M;
+    /// @brief GEMM's N dimension size.
+    index_t N;
+    /// @brief GEMM's K dimension size.
+    index_t K;
+    /// @brief The distance between consecutive elements of non-contiguous dimension
+    ///        (in memory) of A tensor.
+    index_t stride_A;
+    /// @brief The distance between consecutive elements of non-contiguous dimension
+    ///        (in memory) of B tensor.
+    index_t stride_B;
+    /// @brief The distance between consecutive elements of non-contiguous dimension
+    ///        (in memory) of C tensor.
+    index_t stride_C;
+    index_t k_batch;
+};
+
+/// @brief The GEMM kernel template.
+///
+/// @paragraph Overview Overview
+///            This class provides the generic matrix multiplication kernel template. By semantic
+///            division of GEMM algorithm into following parts we achieve flexible, versatile
+///            and robust kernel implementation.
+///
+///            @li @b Prolog - The start of GEMM kernel implementation in @ref operator()
+///                function call operator" which determines the work scope of each workgroup.
+///            @li @b GemmPipeline - The core part @a "heart" of matrix multiplication algorithm.
+///                This is the place where each workgroup is loading data from global memory and
+///                carrying out dot products.
+///            @li @b Epilogue - The @a "final" part of matrix multiplication implementation
+///                 responsible for storing results to global memory. This is also the place where
+///                 any additional operator fusion may take place.
+///
+///            Additionally both @ref GemmPipeline_ "GemmPipeline" and @ref EpiloguePipeline_
+///            "EpiloguePipeline" are parameterized with so called @a Policy which determines all
+///            internal details of those functional parts. You can think of it like both gemm and
+///            epilogue pipelines provides the control-flow logic controlled by policies. Moreover
+///            the policy is responsible for definition of all necessary data layouts and thread's
+///            work distribution.
+///
+/// @tparam TilePartitioner_    The type of class providing mapping of workgroup index into the
+///                             output data tile to be calculated. It determines the workgroup to
+///                             data relationship (or in other words - which data would be
+///                             processed and calculated by which workgroup).
+/// @tparam GemmPipeline_       The type of class which provides the core part of matrix
+///                             multiplication. This class should provide implementation of data
+///                             loading from global memory and performing block-wise matrix
+///                             multiplication. You can think of it as a work done by single
+///                             workgroup point of view.
+/// @tparam EpiloguePipeline_   The type of class providing the final part of matrix
+///                             multiplication implementation. It is responsible for storing
+///                             results calculated by @ref GemmPipeline_ "GemmPipeline" to
+///                             the output C tensor in global memory.
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
 struct GemmKernel
 {
@@ -90,20 +165,6 @@ struct GemmKernel
 
     CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
 
-    struct GemmKernelArgs
-    {
-        const void* a_ptr;
-        const void* b_ptr;
-        void* c_ptr;
-        index_t M;
-        index_t N;
-        index_t K;
-        index_t stride_A;
-        index_t stride_B;
-        index_t stride_C;
-        index_t k_batch;
-    };
-
     CK_TILE_HOST static constexpr GemmKernelArgs MakeKernelArgs(const GemmHostArgs& hostArgs)
     {
         return GemmKernelArgs{hostArgs.a_ptr,
@@ -129,34 +190,34 @@ struct GemmKernel
                                      const std::size_t k_id = blockIdx.z)
         {
             constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t   = kargs.k_batch * K1;
-            const index_t KRead = (kargs.K + K_t - 1) / K_t * K1;
+            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
+            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
 
             if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
             {
-                a_k_split_offset = k_id * KRead;
+                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
             }
             else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
             {
-                a_k_split_offset = k_id * KRead * kargs.stride_A;
+                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A);
             }
 
             if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
             {
-                b_k_split_offset = k_id * KRead * kargs.stride_B;
+                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B);
             }
             else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
-                b_k_split_offset = k_id * KRead;
+                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
             }
 
             if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
             {
-                splitted_k = KRead;
+                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
             }
             else
             {
-                splitted_k = kargs.K - KRead * (kargs.k_batch - 1);
+                splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
             }
         }
 
@@ -167,28 +228,37 @@ struct GemmKernel
 
     CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs)
     {
-        if constexpr(EpiloguePipeline::template GetVectorSizeC<CDataType>() % 2 != 0 &&
+        if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
                      is_any_of<CDataType, fp16_t, bf16_t>::value)
         {
             if(kargs.k_batch != 1)
             {
-                std::cerr << "Conditions not met for Kbatch >1 !" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
+                }
                 return false;
             }
         }
 
         if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
         {
-            if(kargs.K % TilePartitioner::KPerBlock != 0 && GemmPipeline::kPadK == false)
+            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
+               GemmPipeline::kPadK == false)
             {
-                std::cerr << "Can't support K that is not a multiple of KPerBlock"
-                             " without padding!"
-                          << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
+                                  "without padding!");
+                }
                 return false;
             }
             if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
             {
-                std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("K is not a multiple of vector load size for A tensor!");
+                }
                 return false;
             }
         }
@@ -196,14 +266,19 @@ struct GemmKernel
         {
             if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
             {
-                std::cerr << "Can't support M that is not a multiple of MPerBlock"
-                             " without padding!"
-                          << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support M that is not a multiple of MPerBlock without padding!");
+                }
                 return false;
             }
             if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
             {
-                std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("M is not a multiple of vector load size for A tensor!");
+                }
                 return false;
             }
         }
@@ -212,29 +287,40 @@ struct GemmKernel
         {
             if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
             {
-                std::cerr << "Can't support N that is not a multiple of NPerBlock"
-                             " without padding!"
-                          << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support N that is not a multiple of NPerBlock without padding!");
+                }
                 return false;
             }
             if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
             {
-                std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("N is not a multiple of vector load size for B tensor!");
+                }
                 return false;
             }
         }
         else
         {
-            if(kargs.K % TilePartitioner::KPerBlock != 0 && GemmPipeline::kPadK == false)
+            if(kargs.K % (TilePartitioner::KPerBlock * kargs.k_batch) != 0 &&
+               GemmPipeline::kPadK == false)
             {
-                std::cerr << "Can't support K that is not a multiple of KPerBlock"
-                             " without padding!"
-                          << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Can't support K that is not a multiple of k_batch * KPerBlock "
+                                  "without padding!");
+                }
                 return false;
             }
             if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
             {
-                std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("K is not a multiple of vector load size for B tensor!");
+                }
                 return false;
             }
         }
@@ -243,14 +329,19 @@ struct GemmKernel
         {
             if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false)
             {
-                std::cerr << "Can't support N that is not a multiple of NPerBlock"
-                             " without padding!"
-                          << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support N that is not a multiple of NPerBlock without padding!");
+                }
                 return false;
             }
-            if(kargs.N % EpiloguePipeline::template GetVectorSizeC<CDataType>() != 0)
+            if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
             {
-                std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("N is not a multiple of vector load size for C tensor!");
+                }
                 return false;
             }
         }
@@ -258,14 +349,19 @@ struct GemmKernel
         {
             if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false)
             {
-                std::cerr << "Can't support M that is not a multiple of MPerBlock"
-                             " without padding!"
-                          << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR(
+                        "Can't support M that is not a multiple of MPerBlock without padding!");
+                }
                 return false;
             }
-            if(kargs.M % EpiloguePipeline::template GetVectorSizeC<CDataType>() != 0)
+            if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
             {
-                std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl;
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("M is not a multiple of vector load size for C tensor!");
+                }
                 return false;
             }
         }
@@ -279,6 +375,7 @@ struct GemmKernel
                                                    const GemmKernelArgs& kargs,
                                                    const SplitKBatchOffset& splitk_batch_offset)
     {
+        static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
         const auto& a_tensor_view = [&]() {
             if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
             {
@@ -303,21 +400,63 @@ struct GemmKernel
         const auto& b_tensor_view = [&]() {
             if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
             {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    b_ptr,
-                    make_tuple(splitk_batch_offset.splitted_k, kargs.N),
-                    make_tuple(kargs.stride_B, 1),
-                    number<GemmPipeline::GetVectorSizeB()>{},
-                    number<1>{});
+                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
+                {
+                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
+                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
+                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
+                    const auto b_k0_n_k1_desc =
+                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
+                                                     make_tuple(kargs.N * K1, K1, I1),
+                                                     number<VectorSizeB>{},
+                                                     number<1>{});
+                    const auto b_n_k_desc = transform_tensor_descriptor(
+                        b_k0_n_k1_desc,
+                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
+                                   make_pass_through_transform(kargs.N)),
+                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
+                }
+                else
+                {
+                    return make_naive_tensor_view<address_space_enum::global>(
+                        b_ptr,
+                        make_tuple(splitk_batch_offset.splitted_k, kargs.N),
+                        make_tuple(kargs.stride_B, 1),
+                        number<GemmPipeline::GetVectorSizeB()>{},
+                        number<1>{});
+                }
             }
             else
             {
-                return make_naive_tensor_view<address_space_enum::global>(
-                    b_ptr,
-                    make_tuple(kargs.N, splitk_batch_offset.splitted_k),
-                    make_tuple(kargs.stride_B, 1),
-                    number<GemmPipeline::GetVectorSizeB()>{},
-                    number<1>{});
+                if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
+                {
+                    constexpr index_t K1          = GemmPipeline::GetSmemPackB();
+                    const index_t K0              = splitk_batch_offset.splitted_k / K1;
+                    constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
+                    const auto b_k0_n_k1_desc =
+                        make_naive_tensor_descriptor(make_tuple(K0, kargs.N, K1),
+                                                     make_tuple(kargs.N * K1, K1, I1),
+                                                     number<VectorSizeB>{},
+                                                     number<1>{});
+                    const auto b_n_k_desc = transform_tensor_descriptor(
+                        b_k0_n_k1_desc,
+                        make_tuple(make_merge_transform(make_tuple(K0, K1)),
+                                   make_pass_through_transform(kargs.N)),
+                        make_tuple(sequence<0, 2>{}, sequence<1>{}),
+                        make_tuple(sequence<1>{}, sequence<0>{}));
+                    return make_tensor_view<address_space_enum::global>(b_ptr, b_n_k_desc);
+                }
+                else
+                {
+                    return make_naive_tensor_view<address_space_enum::global>(
+                        b_ptr,
+                        make_tuple(kargs.N, splitk_batch_offset.splitted_k),
+                        make_tuple(kargs.stride_B, 1),
+                        number<GemmPipeline::GetVectorSizeB()>{},
+                        number<1>{});
+                }
             }
         }();
 
@@ -329,7 +468,7 @@ struct GemmKernel
                     c_ptr,
                     make_tuple(kargs.M, kargs.N),
                     make_tuple(kargs.stride_C, 1),
-                    number<EpiloguePipeline::template GetVectorSizeC<CDataType>()>{},
+                    number<EpiloguePipeline::GetVectorSizeC()>{},
                     number<1>{});
             }
             else
@@ -488,7 +627,8 @@ struct GemmKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -539,7 +679,8 @@ struct GemmKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -558,7 +699,8 @@ struct GemmKernel
 
     CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const
     {
-        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
+        const auto blockId  = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
         const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
         const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
 
@@ -572,11 +714,11 @@ struct GemmKernel
 
         // allocate LDS
         __shared__ char smem_ptr_0[GetSmemSize()];
-        __shared__ char smem_ptr_1[GetSmemSize()];
 
-        if(kargs.k_batch == 1)
+        if constexpr(GemmPipeline::DoubleSmemBuffer == true)
         {
-            if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+            __shared__ char smem_ptr_1[GetSmemSize()];
+            if(kargs.k_batch == 1)
             {
                 RunGemm2LDS(a_ptr,
                             b_ptr,
@@ -590,17 +732,8 @@ struct GemmKernel
             }
             else
             {
-                RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
-            }
-        }
-        else
-        {
-            // Do not compile in case where we have unsupported
-            // VectorSizeC & data type configuration.
-            if constexpr(!(EpiloguePipeline::template GetVectorSizeC<CDataType>() % 2 != 0 &&
-                           is_any_of<CDataType, fp16_t, bf16_t>::value))
-            {
-                if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+                if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                               is_any_of<CDataType, fp16_t, bf16_t>::value))
                 {
                     RunGemm2LDS<memory_operation_enum::atomic_add>(a_ptr,
                                                                    b_ptr,
@@ -612,7 +745,18 @@ struct GemmKernel
                                                                    i_m,
                                                                    i_n);
                 }
-                else
+            }
+        }
+        else
+        {
+            if(kargs.k_batch == 1)
+            {
+                RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+            }
+            else
+            {
+                if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                               is_any_of<CDataType, fp16_t, bf16_t>::value))
                 {
                     RunGemm<memory_operation_enum::atomic_add>(
                         a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index 751e7c0e1a..5577cb083a 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -11,24 +11,17 @@
 
 namespace ck_tile {
 
-struct GroupedGemmHostArgs : public ck_tile::GemmHostArgs
+struct GemmTransKernelArg
 {
-    CK_TILE_HOST GroupedGemmHostArgs() noexcept = default;
-    CK_TILE_HOST GroupedGemmHostArgs(const void* a_ptr_,
-                                     const void* b_ptr_,
-                                     void* c_ptr_,
-                                     ck_tile::index_t M_,
-                                     ck_tile::index_t N_,
-                                     ck_tile::index_t K_,
-                                     ck_tile::index_t stride_A_,
-                                     ck_tile::index_t stride_B_,
-                                     ck_tile::index_t stride_C_)
-        : GemmHostArgs(a_ptr_, b_ptr_, c_ptr_, KBatch, M_, N_, K_, stride_A_, stride_B_, stride_C_)
+    GemmKernelArgs group_karg;
+    ck_tile::index_t block_start;
+    ck_tile::index_t block_end;
+
+    GemmTransKernelArg() = default;
+    GemmTransKernelArg(GemmKernelArgs&& karg, index_t bl_start, index_t bl_end)
+        : group_karg{karg}, block_start{bl_start}, block_end{bl_end}
     {
     }
-
-    private:
-    static constexpr index_t KBatch = 1;
 };
 
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
@@ -47,36 +40,22 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
 
     using OffsetTile1DPartitioner = OffsettedTile1DPartitioner<TilePartitioner>;
     using Base                    = GemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
-    using GemmKernelArgs          = typename Base::GemmKernelArgs;
 
     static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
 
-    struct GemmTransKernelArg
-    {
-        GemmKernelArgs group_karg;
-        ck_tile::index_t block_start;
-        ck_tile::index_t block_end;
-
-        GemmTransKernelArg() = default;
-        GemmTransKernelArg(GemmKernelArgs&& karg, index_t bl_start, index_t bl_end)
-            : group_karg{karg}, block_start{bl_start}, block_end{bl_end}
-        {
-        }
-    };
-
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
         using P_ = GemmPipeline;
 
         return concat('_', "gemm_grouped", gemm_prec_str<ADataType, BDataType>,
-                      concat('x', P_::kMPerBlock, P_::kNPerBlock, P_::kKPerBlock),
+                      concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock),
                       concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()),
                       concat('x', P_::kPadM, P_::kPadN, P_::kPadK));
         // clang-format on
     }
 
-    __host__ static auto GetWorkSpaceSize(const std::vector<GroupedGemmHostArgs>& gemm_descs)
+    __host__ static auto GetWorkSpaceSize(const std::vector<GemmHostArgs>& gemm_descs)
         -> std::size_t
     {
         return gemm_descs.size() * sizeof(GemmTransKernelArg);
@@ -84,7 +63,7 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
 
     __host__ static constexpr auto BlockSize() -> dim3 { return dim3(KernelBlockSize); }
 
-    __host__ static constexpr auto GridSize(const std::vector<GroupedGemmHostArgs>& gemm_descs)
+    __host__ static constexpr auto GridSize(const std::vector<GemmHostArgs>& gemm_descs)
     {
         index_t grid_size = 0;
         for(const auto& it_desc : gemm_descs)
@@ -95,7 +74,7 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
         return dim3(grid_size, 1, 1);
     }
 
-    CK_TILE_HOST static auto MakeKargs(const std::vector<GroupedGemmHostArgs>& gemm_descs)
+    CK_TILE_HOST static auto MakeKargs(const std::vector<GemmHostArgs>& gemm_descs)
         -> std::vector<GemmTransKernelArg>
     {
         std::vector<GemmTransKernelArg> gemm_kernel_args_;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
index 4855df0e0e..24bd66a59e 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
@@ -68,9 +68,10 @@ struct GemmPipelineAgBgCrImplBase
         return make_tuple(std::move(a_lds_block), std::move(b_lds_block));
     }
 
-    template <typename ADramBlockWindowTmp, typename ALdsTensorView>
-    CK_TILE_DEVICE auto GetAWindows(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                    const ALdsTensorView& a_lds_block_view) const
+    template <typename ADramBlockWindowTmp, typename ALdsTensorView, typename ALdsLoadTileDistr>
+    CK_TILE_DEVICE constexpr auto GetAWindows(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                              const ALdsTensorView& a_lds_block_view,
+                                              const ALdsLoadTileDistr&) const
     {
         constexpr bool is_col_major = std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
 
@@ -88,17 +89,21 @@ struct GemmPipelineAgBgCrImplBase
         auto a_copy_lds_window = make_tile_window(
             a_lds_block_view, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
 
-        auto a_lds_gemm_window = make_tile_window(
-            a_lds_block_view, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
+        auto a_lds_gemm_window =
+            make_tile_window(a_lds_block_view,
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             ALdsLoadTileDistr{});
 
         return make_tuple(std::move(a_copy_dram_window),
                           std::move(a_copy_lds_window),
                           std::move(a_lds_gemm_window));
     }
 
-    template <typename BDramBlockWindowTmp, typename BLdsTensorView>
-    CK_TILE_DEVICE auto GetBWindows(const BDramBlockWindowTmp& b_dram_block_window_tmp,
-                                    const BLdsTensorView& b_lds_block_view) const
+    template <typename BDramBlockWindowTmp, typename BLdsTensorView, typename BLdsLoadTileDistr>
+    CK_TILE_DEVICE constexpr auto GetBWindows(const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                              const BLdsTensorView& b_lds_block_view,
+                                              const BLdsLoadTileDistr&) const
     {
         constexpr bool is_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
 
@@ -117,8 +122,11 @@ struct GemmPipelineAgBgCrImplBase
         auto b_copy_lds_window = make_tile_window(
             b_lds_block_view, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
 
-        auto b_lds_gemm_window = make_tile_window(
-            b_lds_block_view, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+        auto b_lds_gemm_window =
+            make_tile_window(b_lds_block_view,
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             BLdsLoadTileDistr{});
 
         return make_tuple(std::move(b_copy_dram_window),
                           std::move(b_copy_lds_window),
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index 73d5ce8f81..c198c9443a 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -33,8 +33,21 @@ struct BaseGemmPipelineAgBgCrCompV3
 
     CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
     {
-        ignore = num_loop;
-        return TailNumber::Full;
+        if(BlockHasHotloop(num_loop))
+        {
+            return TailNumber::Full;
+        }
+        else
+        {
+            if(num_loop == 1)
+            {
+                return TailNumber::Odd;
+            }
+            else
+            {
+                return TailNumber::Even;
+            }
+        }
     }
 };
 
@@ -77,6 +90,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
     static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
+    static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
+    static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
+
     static constexpr bool kPadM = Problem::kPadM;
     static constexpr bool kPadN = Problem::kPadN;
     static constexpr bool kPadK = Problem::kPadK;
@@ -114,11 +130,11 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
         constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
 
         // Below should be equal to AK1|BK1
-        constexpr index_t A_LDS_Read_Width = Policy::template GetSmemPackA<Problem>();
-        constexpr index_t B_LDS_Read_Width = Policy::template GetSmemPackB<Problem>();
+        constexpr index_t A_LDS_Read_Width = GetSmemPackA();
+        constexpr index_t B_LDS_Read_Width = GetSmemPackB();
 
-        constexpr index_t A_LDS_Write_Width = Policy::template GetSmemPackA<Problem>();
-        constexpr index_t B_LDS_Write_Width = Policy::template GetSmemPackB<Problem>();
+        constexpr index_t A_LDS_Write_Width = GetSmemPackA();
+        constexpr index_t B_LDS_Write_Width = GetSmemPackB();
 
         constexpr index_t A_Buffer_Load_Inst_Num =
             MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA());
@@ -133,7 +149,7 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
         constexpr index_t A_LDS_Read_Inst_Num =
             WaveNumN * MPerBlock * KPerBlock / (BlockSize * A_LDS_Read_Width);
         constexpr index_t B_LDS_Read_Inst_Num =
-            WaveNumM * MPerBlock * KPerBlock / (BlockSize * B_LDS_Read_Width);
+            WaveNumM * NPerBlock * KPerBlock / (BlockSize * B_LDS_Read_Width);
 
         constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
                                             (BlockSize / WaveSize) / (MPerXDL * NPerXDL * KPerXDL);
@@ -174,11 +190,11 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
             constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
 
             // Below should be equal to AK1|BK1
-            constexpr index_t A_LDS_Read_Width = Policy::template GetSmemPackA<Problem>();
-            constexpr index_t B_LDS_Read_Width = Policy::template GetSmemPackB<Problem>();
+            constexpr index_t A_LDS_Read_Width = GetSmemPackA();
+            constexpr index_t B_LDS_Read_Width = GetSmemPackB();
 
-            constexpr index_t A_LDS_Write_Width = Policy::template GetSmemPackA<Problem>();
-            constexpr index_t B_LDS_Write_Width = Policy::template GetSmemPackB<Problem>();
+            constexpr index_t A_LDS_Write_Width = GetSmemPackA();
+            constexpr index_t B_LDS_Write_Width = GetSmemPackB();
 
             constexpr index_t A_Buffer_Load_Inst_Num =
                 MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA());
@@ -193,7 +209,7 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
             constexpr index_t A_LDS_Read_Inst_Num =
                 WaveNumN * MPerBlock * KPerBlock / (BlockSize * A_LDS_Read_Width);
             constexpr index_t B_LDS_Read_Inst_Num =
-                WaveNumM * MPerBlock * KPerBlock / (BlockSize * B_LDS_Read_Width);
+                WaveNumM * NPerBlock * KPerBlock / (BlockSize * B_LDS_Read_Width);
 
             constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
                                                 (BlockSize / WaveSize) /
@@ -346,17 +362,23 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
             // A/B tiles in LDS
             auto&& [a_lds_block, b_lds_block] = Base::GetABLdsTensorViews(p_smem);
 
+            // Tile distribution for load from lds
+            constexpr auto a_lds_load_tile_distr =
+                make_static_tile_distribution(BlockGemm::MakeABlockDistributionEncode());
+            constexpr auto b_lds_load_tile_distr =
+                make_static_tile_distribution(BlockGemm::MakeBBlockDistributionEncode());
+
             // A DRAM tile window for load
             // A LDS tile window for store
             // A LDS tile for block GEMM
             auto&& [a_copy_dram_window, a_copy_lds_window, a_lds_gemm_window] =
-                Base::GetAWindows(a_dram_block_window_tmp, a_lds_block);
+                Base::GetAWindows(a_dram_block_window_tmp, a_lds_block, a_lds_load_tile_distr);
 
             // B DRAM tile window for load
             // B LDS tile window for store
             // B LDS tile for block GEMM
             auto&& [b_copy_dram_window, b_copy_lds_window, b_lds_gemm_window] =
-                Base::GetBWindows(b_dram_block_window_tmp, b_lds_block);
+                Base::GetBWindows(b_dram_block_window_tmp, b_lds_block, b_lds_load_tile_distr);
 
             // Block GEMM
             auto block_gemm   = BlockGemm();
@@ -461,6 +483,7 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
                     block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
 
                     block_sync_lds();
+
                     block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
                     HotLoopScheduler();
                     __builtin_amdgcn_sched_barrier(0);
@@ -469,12 +492,43 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
                 } while(i < (num_loop - 1));
             }
             // tail
-            if constexpr(TailNum == TailNumber::Full)
+            if constexpr((TailNum == TailNumber::Full) || (TailNum == TailNumber::Odd))
             {
+                // Leak last MFMA block to epilogue region, cover the potential lds-shuffle
+                // latency
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+            }
+            else
+            {
+                block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+                block_sync_lds();
+
+                if constexpr(is_a_col_major)
+                {
+                    auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
+                        Policy::template MakeShuffledARegTileDistribution<Problem>());
+                    transpose_tile2d(a_shuffle_tmp, a_block_tile);
+                    Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func);
+                }
+                else
+                {
+                    Base::LocalPrefill(a_copy_lds_window, a_block_tile, a_element_func);
+                }
+                if constexpr(is_b_row_major)
+                {
+                    auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
+                        Policy::template MakeShuffledBRegTileDistribution<Problem>());
+                    transpose_tile2d(b_shuffle_tmp, b_block_tile);
+                    Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func);
+                }
+                else
+                {
+                    Base::LocalPrefill(b_copy_lds_window, b_block_tile, b_element_func);
+                }
+                block_sync_lds();
+                block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window);
                 block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
             }
-            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
-            // latency
             // __builtin_amdgcn_sched_barrier(0);
             return c_block_tile;
         }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index b679f8c8aa..667bb80ce9 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -86,6 +86,9 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
+    static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
+    static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
+
     static constexpr bool kPadM = Problem::kPadM;
     static constexpr bool kPadN = Problem::kPadN;
     static constexpr bool kPadK = Problem::kPadK;
@@ -140,7 +143,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             constexpr index_t A_LDS_Read_Inst_Num =
                 WaveNumN * MPerBlock * KPerBlock / (BlockSize * KPerXDL);
             constexpr index_t B_LDS_Read_Inst_Num =
-                WaveNumM * MPerBlock * KPerBlock / (BlockSize * KPerXDL);
+                WaveNumM * NPerBlock * KPerBlock / (BlockSize * KPerXDL);
 
             constexpr index_t C_MFMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock /
                                                 (BlockSize / WaveSize) /
@@ -214,17 +217,17 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             ////////////// global window & register /////////////////
             // A DRAM tile window for load
             auto a_copy_dram_window =
-                make_tile_window_linear(a_dram_block_window_tmp.get_bottom_tensor_view(),
-                                        make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                        a_dram_block_window_tmp.get_window_origin(),
-                                        Policy::template MakeADramTileDistribution<Problem>());
+                make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 a_dram_block_window_tmp.get_window_origin(),
+                                 Policy::template MakeADramTileDistribution<Problem>());
 
             // B DRAM tile window for load
             auto b_copy_dram_window =
-                make_tile_window_linear(b_dram_block_window_tmp.get_bottom_tensor_view(),
-                                        make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                        b_dram_block_window_tmp.get_window_origin(),
-                                        Policy::template MakeBDramTileDistribution<Problem>());
+                make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 b_dram_block_window_tmp.get_window_origin(),
+                                 Policy::template MakeBDramTileDistribution<Problem>());
 
             // A register tile for global load
             constexpr auto ABlockTileDistr = a_copy_dram_window.get_tile_distribution();
@@ -314,25 +317,25 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             BLdsTile b_block_tile1;
 
             auto a_lds_ld_window0 =
-                make_tile_window_linear(a_lds_block0,
-                                        make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                        {0, 0},
-                                        ALdsTileDistr);
+                make_tile_window(a_lds_block0,
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 ALdsTileDistr);
             auto a_lds_ld_window1 =
-                make_tile_window_linear(a_lds_block1,
-                                        make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
-                                        {0, 0},
-                                        ALdsTileDistr);
+                make_tile_window(a_lds_block1,
+                                 make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 ALdsTileDistr);
             auto b_lds_ld_window0 =
-                make_tile_window_linear(b_lds_block0,
-                                        make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                        {0, 0},
-                                        BLdsTileDistr);
+                make_tile_window(b_lds_block0,
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 BLdsTileDistr);
             auto b_lds_ld_window1 =
-                make_tile_window_linear(b_lds_block1,
-                                        make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
-                                        {0, 0},
-                                        BLdsTileDistr);
+                make_tile_window(b_lds_block1,
+                                 make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                 {0, 0},
+                                 BLdsTileDistr);
 
             Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0);
             Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0);
@@ -439,6 +442,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                             Base::LocalPrefill(
                                 b_copy_lds_window1, b_global_load_tile, b_element_func);
                         }
+                        block_sync_lds();
 
                         Base::GlobalPrefetch(
                             a_global_load_tile, a_copy_dram_window, a_dram_tile_window_step);
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
index e528847438..f6920f1c57 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
@@ -17,56 +17,6 @@ namespace ck_tile {
 struct GemmPipelineAgBgCrCompV4DefaultPolicy
     : public UniversalGemmBasePolicy<GemmPipelineAgBgCrCompV4DefaultPolicy>
 {
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
-    {
-        using namespace ck_tile;
-
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-        constexpr index_t KPack      = GetSmemPackA<Problem>();
-
-        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<kKPerBlock / KPack>{}, number<kMPerBlock>{}, number<KPack>{}),
-            make_tuple(number<kMPerBlock * KPack>{}, number<KPack>{}, number<1>{}),
-            number<KPack>{},
-            number<1>{});
-
-        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
-            a_lds_block_desc_0,
-            make_tuple(
-                make_pass_through_transform(number<kMPerBlock>{}),
-                make_merge_transform(make_tuple(number<kKPerBlock>{} / KPack, number<KPack>{}))),
-            make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return a_lds_block_desc;
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
-    {
-        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-        constexpr index_t KPack      = GetSmemPackB<Problem>();
-
-        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<kKPerBlock / KPack>{}, number<kNPerBlock>{}, number<KPack>{}),
-            make_tuple(number<(kNPerBlock)*KPack>{}, number<KPack>{}, number<1>{}),
-            number<KPack>{},
-            number<1>{});
-
-        constexpr auto b_lds_block_desc = transform_tensor_descriptor(
-            b_lds_block_desc_0,
-            make_tuple(
-                make_pass_through_transform(number<kNPerBlock>{}),
-                make_merge_transform(make_tuple(number<kKPerBlock / KPack>{}, number<KPack>{}))),
-            make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return b_lds_block_desc;
-    }
-
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index b8b2d5b1c9..abf5b617ee 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -129,6 +129,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
     static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
+    static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
+    static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
+
     static constexpr bool kPadM = Problem::kPadM;
     static constexpr bool kPadN = Problem::kPadN;
     static constexpr bool kPadK = Problem::kPadK;
@@ -215,10 +218,17 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             auto& a_lds_block  = ab_lds_blocks.at(I0{});
             auto& b_lds_block  = ab_lds_blocks.at(I1{});
 
+            // Tile distribution for load from lds
+            constexpr auto a_lds_load_tile_distr = decltype(make_static_tile_distribution(
+                BlockGemm::MakeABlockDistributionEncode())){};
+            constexpr auto b_lds_load_tile_distr = decltype(make_static_tile_distribution(
+                BlockGemm::MakeBBlockDistributionEncode())){};
+
             // A DRAM tile window for load
             // A LDS tile window for store
             // A LDS tile for block GEMM
-            auto a_windows           = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block);
+            auto a_windows =
+                Base::GetAWindows(a_dram_block_window_tmp, a_lds_block, a_lds_load_tile_distr);
             auto& a_copy_dram_window = a_windows.at(I0{});
             auto& a_copy_lds_window  = a_windows.at(I1{});
             auto& a_lds_gemm_window  = a_windows.at(I2{});
@@ -226,7 +236,8 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             // B DRAM tile window for load
             // B LDS tile window for store
             // B LDS tile for block GEMM
-            auto b_windows           = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block);
+            auto b_windows =
+                Base::GetBWindows(b_dram_block_window_tmp, b_lds_block, b_lds_load_tile_distr);
             auto& b_copy_dram_window = b_windows.at(I0{});
             auto& b_copy_lds_window  = b_windows.at(I1{});
             auto& b_lds_gemm_window  = b_windows.at(I2{});
@@ -493,10 +504,17 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             auto& a_lds_block  = ab_lds_blocks.at(I0{});
             auto& b_lds_block  = ab_lds_blocks.at(I1{});
 
+            // Tile distribution for load from lds
+            constexpr auto a_lds_load_tile_distr = decltype(make_static_tile_distribution(
+                BlockGemm::MakeABlockDistributionEncode())){};
+            constexpr auto b_lds_load_tile_distr = decltype(make_static_tile_distribution(
+                BlockGemm::MakeBBlockDistributionEncode())){};
+
             // A DRAM tile window for load
             // A LDS tile window for store
             // A LDS tile for block GEMM
-            auto a_windows           = Base::GetAWindows(a_dram_block_window_tmp, a_lds_block);
+            auto a_windows =
+                Base::GetAWindows(a_dram_block_window_tmp, a_lds_block, a_lds_load_tile_distr);
             auto& a_copy_dram_window = a_windows.at(I0{});
             auto& a_copy_lds_window  = a_windows.at(I1{});
             auto& a_lds_gemm_window  = a_windows.at(I2{});
@@ -504,7 +522,8 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             // B DRAM tile window for load
             // B LDS tile window for store
             // B LDS tile for block GEMM
-            auto b_windows           = Base::GetBWindows(b_dram_block_window_tmp, b_lds_block);
+            auto b_windows =
+                Base::GetBWindows(b_dram_block_window_tmp, b_lds_block, b_lds_load_tile_distr);
             auto& b_copy_dram_window = b_windows.at(I0{});
             auto& b_copy_lds_window  = b_windows.at(I1{});
             auto& b_lds_gemm_window  = b_windows.at(I2{});
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index 33945651ae..217408fffa 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -12,7 +12,7 @@ namespace ck_tile {
 //  A Tile Window: global memory
 //  B Tile Window: global memory
 //  C Distributed tensor: register
-template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy>
+template <typename Problem, typename Policy = UniversalGemmPipelineAgBgCrPolicy>
 struct GemmPipelineAGmemBGmemCRegV1
 {
     using ADataType      = remove_cvref_t<typename Problem::ADataType>;
@@ -26,6 +26,10 @@ struct GemmPipelineAGmemBGmemCRegV1
 
     using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
 
+    using I0 = number<0>;
+    using I1 = number<1>;
+    using I2 = number<2>;
+
     static constexpr index_t BlockSize = Problem::kBlockSize;
 
     static constexpr index_t kMPerBlock = BlockGemmShape::kM;
@@ -36,6 +40,9 @@ struct GemmPipelineAGmemBGmemCRegV1
     static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; }
     static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
 
+    static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
+    static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
+
     static constexpr bool kPadM = Problem::kPadM;
     static constexpr bool kPadN = Problem::kPadN;
     static constexpr bool kPadK = Problem::kPadK;
@@ -78,11 +85,21 @@ struct GemmPipelineAGmemBGmemCRegV1
                 std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
             "wrong!");
 
-        static_assert(kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
-                          kNPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
-                          kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
-                      "wrong!");
+        constexpr bool is_a_col_major = std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
+        constexpr bool is_b_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
 
+        static_assert(is_a_col_major
+                          ? (kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                             kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}])
+                          : (kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                             kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}]),
+                      "A block window has incorrect lengths for defined ALayout!");
+        static_assert(is_b_row_major
+                          ? (kKPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                             kNPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}])
+                          : (kNPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
+                             kKPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]),
+                      "B block window has incorrect lengths for defined BLayout!");
         // A tile in LDS
         ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
 
@@ -125,13 +142,25 @@ struct GemmPipelineAGmemBGmemCRegV1
         auto b_copy_lds_window = make_tile_window(
             b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
 
+        // Tile distribution for load from lds
+        constexpr auto a_lds_load_tile_distr =
+            make_static_tile_distribution(BlockGemm::MakeABlockDistributionEncode());
+        constexpr auto b_lds_load_tile_distr =
+            make_static_tile_distribution(BlockGemm::MakeBBlockDistributionEncode());
+
         // A LDS tile for block GEMM
-        auto a_lds_gemm_window = make_tile_window(
-            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+        auto a_lds_gemm_window =
+            make_tile_window(a_lds_block,
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             {0, 0},
+                             a_lds_load_tile_distr);
 
         // B LDS tile for block GEMM
-        auto b_lds_gemm_window = make_tile_window(
-            b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+        auto b_lds_gemm_window =
+            make_tile_window(b_lds_block,
+                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
+                             {0, 0},
+                             b_lds_load_tile_distr);
 
         // Block GEMM
         auto block_gemm = BlockGemm();
@@ -153,11 +182,11 @@ struct GemmPipelineAGmemBGmemCRegV1
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
             // LDS write 0
-            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
+            if constexpr(is_a_col_major)
             {
                 auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
-                    Policy::template MakeShuffledARegBlockDistribution<Problem>());
-                shuffle_tile(a_shuffle_tmp, a_block_tile);
+                    Policy::template MakeShuffledARegTileDistribution<Problem>());
+                transpose_tile2d(a_shuffle_tmp, a_block_tile);
                 const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
                 store_tile(a_copy_lds_window, a_block_tile_tmp);
             }
@@ -167,11 +196,11 @@ struct GemmPipelineAGmemBGmemCRegV1
             }
 
             // LDS write 0
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            if constexpr(is_b_row_major)
             {
                 auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
-                    Policy::template MakeShuffledBRegBlockDistribution<Problem>());
-                shuffle_tile(b_shuffle_tmp, b_block_tile);
+                    Policy::template MakeShuffledBRegTileDistribution<Problem>());
+                transpose_tile2d(b_shuffle_tmp, b_block_tile);
                 const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_shuffle_tmp);
                 store_tile(b_copy_lds_window, b_block_tile_tmp);
             }
@@ -200,15 +229,26 @@ struct GemmPipelineAGmemBGmemCRegV1
             move_tile_window(b_copy_dram_window, {0, kKPerBlock});
 
             // LDS write i + 1
-            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
-            store_tile(a_copy_lds_window, a_block_tile_tmp);
+            if constexpr(is_a_col_major)
+            {
+                auto a_shuffle_tmp_loop = make_static_distributed_tensor<ADataType>(
+                    Policy::template MakeShuffledARegTileDistribution<Problem>());
+                transpose_tile2d(a_shuffle_tmp_loop, a_block_tile);
+                store_tile(a_copy_lds_window,
+                           tile_elementwise_in(a_element_func, a_shuffle_tmp_loop));
+            }
+            else
+            {
+                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+                store_tile(a_copy_lds_window, a_block_tile_tmp);
+            }
 
             // LDS write i + 1
-            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            if constexpr(is_b_row_major)
             {
                 auto b_shuffle_tmp_loop = make_static_distributed_tensor<BDataType>(
-                    Policy::template MakeShuffledBRegBlockDistribution<Problem>());
-                shuffle_tile(b_shuffle_tmp_loop, b_block_tile);
+                    Policy::template MakeShuffledBRegTileDistribution<Problem>());
+                transpose_tile2d(b_shuffle_tmp_loop, b_block_tile);
                 store_tile(b_copy_lds_window,
                            tile_elementwise_in(b_element_func, b_shuffle_tmp_loop));
             }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
old mode 100644
new mode 100755
index c7115c8eb4..6bb14af9e6
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -129,7 +129,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
             constexpr index_t KPack = GetSmemPackA<Problem>();
             static_assert(KPack % K3 == 0);
             constexpr index_t K2 = KPack / K3;
-            if constexpr(get_warp_size() % (K2 * M0))
+            if constexpr(get_warp_size() >= (K2 * M0))
             {
                 constexpr index_t K1 = get_warp_size() / (K2 * M0);
                 constexpr index_t K0 = BlockSize / get_warp_size();
@@ -219,7 +219,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
             constexpr index_t KPack = GetSmemPackB<Problem>();
             static_assert(KPack % K3 == 0);
             constexpr index_t K2 = KPack / K3;
-            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            if constexpr(get_warp_size() >= (K2 * N0))
             {
                 constexpr index_t K1 = get_warp_size() / (K2 * N0);
                 constexpr index_t K0 = BlockSize / get_warp_size();
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index fe706113ae..95b7618b11 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -31,6 +31,9 @@ struct GemmPipelineAGmemBGmemCRegV2
     static constexpr index_t kNPerBlock = BlockGemmShape::kN;
     static constexpr index_t kKPerBlock = BlockGemmShape::kK;
 
+    static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
+    static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
@@ -122,17 +125,29 @@ struct GemmPipelineAGmemBGmemCRegV2
                              {0, 0},
                              b_copy_dram_window.get_tile_distribution());
 
-        // A LDS tile for block GEMM
-        auto a_lds_gemm_window = make_tile_window(
-            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
-
-        // B LDS tile for block GEMM
-        auto b_lds_gemm_window = make_tile_window(
-            b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
-
         // Block GEMM
         constexpr auto block_gemm = Policy::template GetBlockGemm<Problem>();
 
+        // Tile distribution for load from lds
+        constexpr auto a_lds_load_tile_distr =
+            make_static_tile_distribution(decltype(block_gemm)::MakeABlockDistributionEncode());
+        constexpr auto b_lds_load_tile_distr =
+            make_static_tile_distribution(decltype(block_gemm)::MakeBBlockDistributionEncode());
+
+        // A LDS tile for block GEMM
+        auto a_lds_gemm_window =
+            make_tile_window(a_lds_block,
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             {0, 0},
+                             a_lds_load_tile_distr);
+
+        // B LDS tile for block GEMM
+        auto b_lds_gemm_window =
+            make_tile_window(b_lds_block,
+                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
+                             {0, 0},
+                             b_lds_load_tile_distr);
+
         // Acc register tile
         auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
index f833ccc849..0b38e7789e 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -32,6 +32,8 @@ struct GemmPipelineProblemBase
 
     static constexpr bool TransposeC = Traits::TransposeC;
 
+    static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity;
+
     static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
 
     static constexpr bool kPadM = Traits::kPadM;
@@ -194,7 +196,8 @@ struct UniversalGemmPipelineProblem
     static constexpr auto HasHotLoop = HasHotLoop_;
     static constexpr auto TailNum    = TailNum_;
 
-    static constexpr bool TransposeC = Traits::TransposeC;
+    static constexpr bool TransposeC            = Traits::TransposeC;
+    static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index fd1e76a02b..6890cf2f64 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -19,6 +19,245 @@ struct UniversalGemmBasePolicy
     static constexpr auto ATileAccessPattern = tile_distribution_pattern::thread_raked;
     static constexpr auto BTileAccessPattern = tile_distribution_pattern::thread_raked;
 
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t KPack     = GetSmemPackA<Problem>();
+
+        constexpr auto DataTypeSize = sizeof(ADataType);
+        constexpr auto MLdsLayer =
+            (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
+
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<KPerBlock / KPack * MLdsLayer>{},
+                       number<MPerBlock / MLdsLayer>{},
+                       number<KPack>{}),
+            make_tuple(number<KPack>{}, number<KPerBlock * MLdsLayer>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+
+        constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+            a_lds_block_desc_0,
+            make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
+                                                     number<KPerBlock / KPack * MLdsLayer>{})),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}));
+
+        constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
+            a_lds_block_desc_permuted,
+            make_tuple(make_unmerge_transform(
+                           make_tuple(number<MLdsLayer>{}, number<KPerBlock / KPack>{})),
+                       make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
+
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+            a_lds_block_desc_xk0_mnldslayer_mn_xk1,
+            make_tuple(make_merge_transform_v3_division_mod(
+                           make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
+            make_tuple(sequence<1, 0>{}, sequence<2, 3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return a_lds_block_desc;
+    }
+
+    /**
+     * @brief Create LDS block descriptor for B tensor.
+     *
+     * @tparam Problem  Gemm pipeline problem.
+     * @return B tensor LDS block descriptor.
+     */
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+        // using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+#if 1
+        // if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t KPack     = GetSmemPackB<Problem>();
+            constexpr auto BK0          = number<KPerBlock / KPack>{};
+            constexpr auto DataTypeSize = sizeof(BDataType);
+            constexpr auto NLdsLayer =
+                (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
+
+            constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
+                make_tuple(
+                    BK0 * number<NLdsLayer>{}, number<NPerBlock / NLdsLayer>{}, number<KPack>{}),
+                make_tuple(number<KPack>{}, number<KPerBlock * NLdsLayer>{}, number<1>{}),
+                number<KPack>{},
+                number<1>{});
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc_0,
+                make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
+                                                         BK0 * number<NLdsLayer>{})),
+                           make_pass_through_transform(number<KPack>{})),
+                make_tuple(sequence<1, 0>{}, sequence<2>{}),
+                make_tuple(sequence<1, 0>{}, sequence<2>{}));
+
+            constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(make_unmerge_transform(make_tuple(number<NLdsLayer>{}, BK0)),
+                           make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
+                           make_pass_through_transform(number<KPack>{})),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
+
+            constexpr auto b_lds_block_desc = transform_tensor_descriptor(
+                b_lds_block_desc_bk0_nldslayer_n_bk1,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{})),
+                           make_merge_transform_v3_division_mod(make_tuple(BK0, number<KPack>{}))),
+                make_tuple(sequence<1, 0>{}, sequence<2, 3>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+            return b_lds_block_desc;
+        }
+#else
+        else // B is Row Major
+        {
+            constexpr index_t BlockSize   = Problem::kBlockSize;
+            constexpr index_t VecLoadSize = GetVectorSizeB<Problem>();
+            using TileEncodingPattern     = TileDistributionEncodingPattern2D<BlockSize,
+                                                                          KPerBlock,
+                                                                          NPerBlock,
+                                                                          VecLoadSize,
+                                                                          BTileAccessPattern>;
+
+            constexpr auto BK0 = number<TileEncodingPattern::X1>{};
+            constexpr auto BK1 = number<TileEncodingPattern::Y0>{};
+            // constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
+            constexpr auto N0 = TileEncodingPattern::X0;
+            constexpr auto N1 = NPerBlock / N0;
+
+            using WarpTile         = typename Problem::BlockGemmShape::WarpTile;
+            constexpr auto NPerXdl = number<WarpTile::at(I1)>{};
+
+            // constexpr auto KThreadWrite     =
+            // BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
+            constexpr auto KThreadWrite     = TileEncodingPattern::Y2;
+            constexpr auto K0PerThreadWrite = BK0 / KThreadWrite;
+            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto K0PerThreadRead  = BK0 / KThreadRead;
+
+            constexpr auto kfold =
+                (BK1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (BK1 * N0 * sizeof(BDataType));
+            constexpr auto KThreadReadPerm =
+                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
+                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
+                    : KThreadRead;
+
+            // 1<=npair<=n0
+            constexpr auto npair = (BK1 * NPerXdl * sizeof(BDataType) > 128)
+                                       ? 1
+                                       : ((128 / (BK1 * NPerXdl * sizeof(BDataType))) > N0
+                                              ? N0
+                                              : 128 / (BK1 * NPerXdl * sizeof(BDataType)));
+
+            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
+                make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
+                           number<K0PerThreadWrite>{},
+                           number<KThreadReadPerm * N1>{},
+                           number<kfold * N0 / npair>{},
+                           number<npair>{},
+                           BK1));
+
+            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+                b_lds_block_desc,
+                make_tuple(
+                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(number<K0PerThreadWrite>{}),
+                    make_xor_transform(
+                        make_tuple(number<KThreadReadPerm * N1>{}, number<kfold * N0 / npair>{})),
+                    make_pass_through_transform(number<npair>{}),
+                    make_pass_through_transform(BK1)),
+                make_tuple(
+                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
+                make_tuple(
+                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
+
+            constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
+                b_lds_block_desc_permuted,
+                make_tuple(
+                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
+                    make_pass_through_transform(number<K0PerThreadWrite>{}),
+                    make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<N1>{})),
+                    make_unmerge_transform(make_tuple(number<kfold>{}, number<N0 / npair>{})),
+                    make_pass_through_transform(number<npair>{}),
+                    make_pass_through_transform(BK1)),
+                make_tuple(sequence<0>{},
+                           sequence<1>{},
+                           sequence<2>{},
+                           sequence<3>{},
+                           sequence<4>{},
+                           sequence<5>{}),
+                make_tuple(sequence<1>{},
+                           sequence<2>{},
+                           sequence<0, 3>{},
+                           sequence<4, 5>{},
+                           sequence<6>{},
+                           sequence<7>{}));
+
+            // constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
+            //     b_lds_block_desc_unmerged,
+            //     make_tuple(make_merge_transform_v3_division_mod(
+            //                    make_tuple(number<KThreadReadPerm>{},
+            //                               number<KThreadWrite / kfold / KThreadReadPerm>{},
+            //                               number<kfold>{},
+            //                               number<K0PerThreadWrite>{})),
+            //                make_merge_transform_v3_division_mod(
+            //                    make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{})),
+            //                make_pass_through_transform(BK1)),
+            //     make_tuple(sequence<0, 1, 4, 2>{}, sequence<5, 6, 3>{}, sequence<7>{}),
+            //     make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+            constexpr auto b_lds_block_desc_kn = transform_tensor_descriptor(
+                b_lds_block_desc_unmerged,
+                make_tuple(make_merge_transform_v3_division_mod(
+                               make_tuple(number<KThreadReadPerm>{},
+                                          number<KThreadWrite / kfold / KThreadReadPerm>{},
+                                          number<kfold>{},
+                                          number<K0PerThreadWrite>{},
+                                          BK1)),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{}))),
+                make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
+                make_tuple(sequence<1>{}, sequence<0>{}));
+
+            // return b_lds_block_desc_bk0_n_bk1;
+            return b_lds_block_desc_kn;
+
+            // constexpr auto b_lds_block_desc_bk0_n_bk1 = make_naive_tensor_descriptor(
+            //     make_tuple(BK0, number<NPerBlock>{}, number<KPack>{}),
+            //     make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
+            //     number<KPack>{},
+            //     number<1>{});
+
+            // constexpr auto b_lds_block_desc = transform_tensor_descriptor(
+            //     b_lds_block_desc_bk0_n_bk1,
+            //     make_tuple(make_pass_through_transform(number<NPerBlock>{}),
+            //                make_merge_transform_v3_division_mod(make_tuple(BK0,
+            //                number<KPack>{}))),
+            //     make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            //     make_tuple(sequence<0>{}, sequence<1>{}));
+
+            // return b_lds_block_desc;
+        }
+#endif
+    }
+
     /**
      * @brief Get the maximum global memory vector load size.
      *
@@ -252,7 +491,7 @@ struct UniversalGemmBasePolicy
         using ALayout = remove_cvref_t<typename Problem::ALayout>;
         static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>);
         constexpr index_t BlockSize   = Problem::kBlockSize;
-        constexpr index_t MPerBlock   = Problem::BlockGemmShape::kN;
+        constexpr index_t MPerBlock   = Problem::BlockGemmShape::kM;
         constexpr index_t KPerBlock   = Problem::BlockGemmShape::kK;
         constexpr index_t VecLoadSize = GetVectorSizeA<Problem>();
 
@@ -301,7 +540,7 @@ struct UniversalGemmBasePolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
     {
-        constexpr auto a_lds_desc     = Derived::template MakeALdsBlockDescriptor<Problem>();
+        constexpr auto a_lds_desc     = MakeALdsBlockDescriptor<Problem>();
         constexpr index_t smem_size_a = integer_least_multiple(
             sizeof(typename Problem::ADataType) * a_lds_desc.get_element_space_size(), 16);
         return smem_size_a;
@@ -310,7 +549,7 @@ struct UniversalGemmBasePolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
     {
-        constexpr auto b_lds_desc     = Derived::template MakeBLdsBlockDescriptor<Problem>();
+        constexpr auto b_lds_desc     = MakeBLdsBlockDescriptor<Problem>();
         constexpr index_t smem_size_b = integer_least_multiple(
             sizeof(typename Problem::BDataType) * b_lds_desc.get_element_space_size(), 16);
         return smem_size_b;
@@ -330,245 +569,6 @@ struct UniversalGemmBasePolicy
 struct UniversalGemmPipelineAgBgCrPolicy
     : public UniversalGemmBasePolicy<UniversalGemmPipelineAgBgCrPolicy>
 {
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
-    {
-        using ADataType = remove_cvref_t<typename Problem::ADataType>;
-
-        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-        constexpr index_t KPack     = GetSmemPackA<Problem>();
-
-        constexpr auto DataTypeSize = sizeof(ADataType);
-        constexpr auto MLdsLayer =
-            (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
-
-        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<KPerBlock / KPack * MLdsLayer>{},
-                       number<MPerBlock / MLdsLayer>{},
-                       number<KPack>{}),
-            make_tuple(number<KPack>{}, number<KPerBlock * MLdsLayer>{}, number<1>{}),
-            number<KPack>{},
-            number<1>{});
-
-        constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
-            a_lds_block_desc_0,
-            make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
-                                                     number<KPerBlock / KPack * MLdsLayer>{})),
-                       make_pass_through_transform(number<KPack>{})),
-            make_tuple(sequence<1, 0>{}, sequence<2>{}),
-            make_tuple(sequence<1, 0>{}, sequence<2>{}));
-
-        constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
-            a_lds_block_desc_permuted,
-            make_tuple(make_unmerge_transform(
-                           make_tuple(number<KPerBlock / KPack>{}, number<MLdsLayer>{})),
-                       make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
-                       make_pass_through_transform(number<KPack>{})),
-            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
-            make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
-
-        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
-            a_lds_block_desc_xk0_mnldslayer_mn_xk1,
-            make_tuple(make_merge_transform_v3_division_mod(
-                           make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
-                       make_merge_transform_v3_division_mod(
-                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
-            make_tuple(sequence<1, 2>{}, sequence<0, 3>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return a_lds_block_desc;
-    }
-
-    /**
-     * @brief Create LDS block descriptor for B tensor.
-     *
-     * @tparam Problem  Gemm pipeline problem.
-     * @return B tensor LDS block descriptor.
-     */
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
-    {
-        // using BLayout   = remove_cvref_t<typename Problem::BLayout>;
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-
-        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-
-#if 1
-        // if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
-        {
-            constexpr index_t KPack     = GetSmemPackB<Problem>();
-            constexpr auto BK0          = number<KPerBlock / KPack>{};
-            constexpr auto DataTypeSize = sizeof(BDataType);
-            constexpr auto NLdsLayer =
-                (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
-
-            constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
-                make_tuple(
-                    BK0 * number<NLdsLayer>{}, number<NPerBlock / NLdsLayer>{}, number<KPack>{}),
-                make_tuple(number<KPack>{}, number<KPerBlock * NLdsLayer>{}, number<1>{}),
-                number<KPack>{},
-                number<1>{});
-
-            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
-                b_lds_block_desc_0,
-                make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
-                                                         BK0 * number<NLdsLayer>{})),
-                           make_pass_through_transform(number<KPack>{})),
-                make_tuple(sequence<1, 0>{}, sequence<2>{}),
-                make_tuple(sequence<1, 0>{}, sequence<2>{}));
-
-            constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor(
-                b_lds_block_desc_permuted,
-                make_tuple(make_unmerge_transform(make_tuple(BK0, number<NLdsLayer>{})),
-                           make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
-                           make_pass_through_transform(number<KPack>{})),
-                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
-                make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
-
-            constexpr auto b_lds_block_desc = transform_tensor_descriptor(
-                b_lds_block_desc_bk0_nldslayer_n_bk1,
-                make_tuple(make_merge_transform_v3_division_mod(
-                               make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{})),
-                           make_merge_transform_v3_division_mod(make_tuple(BK0, number<KPack>{}))),
-                make_tuple(sequence<1, 2>{}, sequence<0, 3>{}),
-                make_tuple(sequence<0>{}, sequence<1>{}));
-            return b_lds_block_desc;
-        }
-#else
-        else // B is Row Major
-        {
-            constexpr index_t BlockSize   = Problem::kBlockSize;
-            constexpr index_t VecLoadSize = GetVectorSizeB<Problem>();
-            using TileEncodingPattern     = TileDistributionEncodingPattern2D<BlockSize,
-                                                                          KPerBlock,
-                                                                          NPerBlock,
-                                                                          VecLoadSize,
-                                                                          BTileAccessPattern>;
-
-            constexpr auto BK0 = number<TileEncodingPattern::X1>{};
-            constexpr auto BK1 = number<TileEncodingPattern::Y0>{};
-            // constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1);
-            constexpr auto N0 = TileEncodingPattern::X0;
-            constexpr auto N1 = NPerBlock / N0;
-
-            using WarpTile         = typename Problem::BlockGemmShape::WarpTile;
-            constexpr auto NPerXdl = number<WarpTile::at(I1)>{};
-
-            // constexpr auto KThreadWrite     =
-            // BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
-            constexpr auto KThreadWrite     = TileEncodingPattern::Y2;
-            constexpr auto K0PerThreadWrite = BK0 / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
-            constexpr auto K0PerThreadRead  = BK0 / KThreadRead;
-
-            constexpr auto kfold =
-                (BK1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (BK1 * N0 * sizeof(BDataType));
-            constexpr auto KThreadReadPerm =
-                (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
-                    ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
-                    : KThreadRead;
-
-            // 1<=npair<=n0
-            constexpr auto npair = (BK1 * NPerXdl * sizeof(BDataType) > 128)
-                                       ? 1
-                                       : ((128 / (BK1 * NPerXdl * sizeof(BDataType))) > N0
-                                              ? N0
-                                              : 128 / (BK1 * NPerXdl * sizeof(BDataType)));
-
-            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
-                make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
-                           number<K0PerThreadWrite>{},
-                           number<KThreadReadPerm * N1>{},
-                           number<kfold * N0 / npair>{},
-                           number<npair>{},
-                           BK1));
-
-            constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
-                b_lds_block_desc,
-                make_tuple(
-                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
-                    make_pass_through_transform(number<K0PerThreadWrite>{}),
-                    make_xor_transform(
-                        make_tuple(number<KThreadReadPerm * N1>{}, number<kfold * N0 / npair>{})),
-                    make_pass_through_transform(number<npair>{}),
-                    make_pass_through_transform(BK1)),
-                make_tuple(
-                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
-                make_tuple(
-                    sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
-
-            constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
-                b_lds_block_desc_permuted,
-                make_tuple(
-                    make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
-                    make_pass_through_transform(number<K0PerThreadWrite>{}),
-                    make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<N1>{})),
-                    make_unmerge_transform(make_tuple(number<kfold>{}, number<N0 / npair>{})),
-                    make_pass_through_transform(number<npair>{}),
-                    make_pass_through_transform(BK1)),
-                make_tuple(sequence<0>{},
-                           sequence<1>{},
-                           sequence<2>{},
-                           sequence<3>{},
-                           sequence<4>{},
-                           sequence<5>{}),
-                make_tuple(sequence<1>{},
-                           sequence<2>{},
-                           sequence<0, 3>{},
-                           sequence<4, 5>{},
-                           sequence<6>{},
-                           sequence<7>{}));
-
-            // constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor(
-            //     b_lds_block_desc_unmerged,
-            //     make_tuple(make_merge_transform_v3_division_mod(
-            //                    make_tuple(number<KThreadReadPerm>{},
-            //                               number<KThreadWrite / kfold / KThreadReadPerm>{},
-            //                               number<kfold>{},
-            //                               number<K0PerThreadWrite>{})),
-            //                make_merge_transform_v3_division_mod(
-            //                    make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{})),
-            //                make_pass_through_transform(BK1)),
-            //     make_tuple(sequence<0, 1, 4, 2>{}, sequence<5, 6, 3>{}, sequence<7>{}),
-            //     make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
-
-            constexpr auto b_lds_block_desc_kn = transform_tensor_descriptor(
-                b_lds_block_desc_unmerged,
-                make_tuple(make_merge_transform_v3_division_mod(
-                               make_tuple(number<KThreadReadPerm>{},
-                                          number<KThreadWrite / kfold / KThreadReadPerm>{},
-                                          number<kfold>{},
-                                          number<K0PerThreadWrite>{},
-                                          BK1)),
-                           make_merge_transform_v3_division_mod(
-                               make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{}))),
-                make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
-                make_tuple(sequence<1>{}, sequence<0>{}));
-
-            // return b_lds_block_desc_bk0_n_bk1;
-            return b_lds_block_desc_kn;
-
-            // constexpr auto b_lds_block_desc_bk0_n_bk1 = make_naive_tensor_descriptor(
-            //     make_tuple(BK0, number<NPerBlock>{}, number<KPack>{}),
-            //     make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
-            //     number<KPack>{},
-            //     number<1>{});
-
-            // constexpr auto b_lds_block_desc = transform_tensor_descriptor(
-            //     b_lds_block_desc_bk0_n_bk1,
-            //     make_tuple(make_pass_through_transform(number<NPerBlock>{}),
-            //                make_merge_transform_v3_division_mod(make_tuple(BK0,
-            //                number<KPack>{}))),
-            //     make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            //     make_tuple(sequence<0>{}, sequence<1>{}));
-
-            // return b_lds_block_desc;
-        }
-#endif
-    }
-
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
@@ -580,7 +580,9 @@ struct UniversalGemmPipelineAgBgCrPolicy
                                                 WarpTile::at(I0),
                                                 WarpTile::at(I1),
                                                 WarpTile::at(I2),
-                                                Problem::TransposeC>;
+                                                Problem::TransposeC,
+                                                false,
+                                                Problem::UseStructuredSparsity>;
         using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
                                                                       typename Problem::BDataType,
                                                                       typename Problem::CDataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp
index 24a399f18d..f0aa4472e1 100644
--- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp
@@ -8,7 +8,11 @@
 
 namespace ck_tile {
 
-template <typename BlockTile_, typename BlockWarps_, typename WarpTile_>
+template <typename BlockTile_,
+          typename BlockWarps_,
+          typename WarpTile_,
+          bool PermuteA_ = false,
+          bool PermuteB_ = false>
 struct TileGemmShape
 {
     using BlockTile  = remove_cvref_t<BlockTile_>;
@@ -21,6 +25,9 @@ struct TileGemmShape
     static constexpr index_t kN = BlockTile::at(number<1>{});
     static constexpr index_t kK = BlockTile::at(number<2>{});
 
+    static constexpr bool PermuteA = PermuteA_;
+    static constexpr bool PermuteB = PermuteB_;
+
     CK_TILE_HOST static std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
index d0e1f60d38..a31004b425 100644
--- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -26,7 +26,8 @@ struct TileGemmTraits
     using BLayout = BLayout_;
     using CLayout = CLayout_;
 
-    static constexpr bool TransposeC = false;
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
 };
 
 template <bool kPadM_,
@@ -36,7 +37,8 @@ template <bool kPadM_,
           typename ALayout_,
           typename BLayout_,
           typename CLayout_,
-          bool TransposeC_ = false>
+          bool TransposeC_            = false,
+          bool UseStructuredSparsity_ = false>
 struct TileGemmUniversalTraits
 {
     static constexpr bool kPadM = kPadM_;
@@ -49,7 +51,8 @@ struct TileGemmUniversalTraits
     using BLayout = BLayout_;
     using CLayout = CLayout_;
 
-    static constexpr bool TransposeC = TransposeC_;
+    static constexpr bool TransposeC            = TransposeC_;
+    static constexpr bool UseStructuredSparsity = UseStructuredSparsity_;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
index 1fd12973f6..22962b9404 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,9 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
 
+#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp"
+
 namespace ck_tile {
 
 // fp16
@@ -17,13 +20,24 @@ using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl<
 using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl<
     WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
+#if defined(__gfx950__)
+using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
+
+#else
 using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
     2>>;
+#endif
 
+#if defined(__gfx950__)
+using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
     2>>;
+#endif
 
 using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
     WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
@@ -41,20 +55,38 @@ using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
+#if defined(__gfx950__)
+using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
+#endif
 
+#if defined(__gfx950__)
+using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
         2>>;
+#endif
 
+#if defined(__gfx950__)
+using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
+#endif
 
 using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M4N64K4<WGAttrCtlEnum::Default_>,
@@ -64,21 +96,39 @@ using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK
     WarpGemmAttributeMfmaImplF16F16F32M64N4K4<WGAttrCtlEnum::Default_>,
     4>>;
 
-// bf16
+// fp16 2:4 structured sparsity
 
+using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl<WarpGemmAttributeSmfmac<
+    WarpGemmAttributeSmfmacImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl<WarpGemmAttributeSmfmac<
+    WarpGemmAttributeSmfmacImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>>>;
+
+// bf16
 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl<
     WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl<
     WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
+#if defined(__gfx950__)
+using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
+
+#else
 using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
     2>>;
+#endif
 
+#if defined(__gfx950__)
+using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
     2>>;
+#endif
 
 using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
     WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
@@ -97,20 +147,38 @@ using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
+#if defined(__gfx950__)
+using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
+#endif
 
+#if defined(__gfx950__)
+using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
         2>>;
+#endif
 
+#if defined(__gfx950__)
+using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+#else
 using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
     WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
+#endif
 
 using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4<WGAttrCtlEnum::Default_>,
@@ -134,6 +202,32 @@ using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl<
 using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl<
     WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
 
+using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>,
+    2>>;
+
+using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+    WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>,
+    2>>;
+
+using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_16x16x128_fp8_fp8 = WarpGemmImpl<WarpGemmAtrributeMfma<
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_16x16x128_fp8_bf8 = WarpGemmImpl<WarpGemmAtrributeMfma<
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_16x16x128_bf8_fp8 = WarpGemmImpl<WarpGemmAtrributeMfma<
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_f32_16x16x128_bf8_bf8 = WarpGemmImpl<WarpGemmAtrributeMfma<
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>>>;
+
 using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed =
     WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index e7d4c37966..93ccdb5f57 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -356,7 +356,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
     }
 };
 
-template <typename WarpGemmAttributeMfmaImpl_>
+template <typename WarpGemmAttributeMfmaImpl_, index_t SFactor_ = 2>
 struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
 {
     using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
@@ -373,6 +373,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
     static constexpr index_t kN          = Impl::kM;
     static constexpr index_t kK          = Impl::kK;
     static constexpr index_t kKPerThread = Impl::kABKPerLane;
+    static constexpr index_t SFactor     = SFactor_; // group how many CM1 together
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
 
@@ -386,7 +387,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
         tuple<sequence<0, 0>>,
         sequence<2>,
         sequence<1>>;
-
+#if 0
     using BWarpDstrEncoding = tile_distribution_encoding<
         sequence<>,
         tuple<sequence<Impl::kAMLane / (Impl::kABKPerLane * Impl::kABKLane * 2),
@@ -407,7 +408,29 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
         tuple<sequence<1, 0>>,
         sequence<2, 2>,
         sequence<0, 2>>;
+#else
+    // TODO: more test not only 32x32
+    using BWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kAMLane / (Impl::kCMLane * SFactor * Impl::kCM1PerLane),
+                       Impl::kCMLane,
+                       SFactor,
+                       Impl::kCM1PerLane>,
+              sequence<Impl::kABKLane, Impl::kABKPerLane>>,
+        tuple<sequence<2, 1, 1, 1, 1>>,
+        tuple<sequence<0, 0, 2, 1, 3>>,
+        sequence<2>,
+        sequence<1>>;
 
+    using CWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kCNLane>,
+              sequence<Impl::kCM0PerLane / SFactor, Impl::kCMLane, Impl::kCM1PerLane * SFactor>>,
+        tuple<sequence<2, 1>>,
+        tuple<sequence<1, 0>>,
+        sequence<2, 2>,
+        sequence<0, 2>>;
+#endif
     template <bool post_nop_ = false>
     // c_vec += a_vec * b_vec
     CK_TILE_DEVICE void operator()(CVecType& c_vec,
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index 21a865e792..cd32f35180 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -61,6 +61,69 @@ enum class WGAttrCtlEnum
         DISPATCH_MFMA_(mfma_, "+a", "v", "v", "a")     \
     }
 
+// V_MFMA_F32_16x16x32_BF16
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<bf16_t, 8>;
+    using BVecType = ext_vector_t<bf16_t, 8>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 32;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x32_bf16", Ctrl)
+        else
+        {
+#if defined(__gfx950__)
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a_vec, b_vec, c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx950__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_16x16x32_bf16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
 // FP16
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
@@ -188,6 +251,69 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
     }
 };
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplF16F16F32M16N16K32
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<fp16_t, 8>;
+    using BVecType = ext_vector_t<fp16_t, 8>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 32;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x32f16", Ctrl)
+        else
+        {
+#if defined(__gfx950__)
+            c_vec = __builtin_amdgcn_mfma_f32_16x16x32_f16(a_vec, b_vec, c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx950__)
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_16x16x32_f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImplF16F16F32M4N64K4
 {
@@ -622,7 +748,395 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4
     }
 };
 
+// gfx950
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplF16F16F32M32N32K16
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<fp16_t, 8>;
+    using BVecType = ext_vector_t<fp16_t, 8>;
+    using CVecType = ext_vector_t<float, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 16;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x16_f16", Ctrl)
+        else
+        {
+#if defined(__gfx950__)
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_f16(a_vec, b_vec, c_vec, 0, 0, 0);
+#elif defined(__gfx90a__) || defined(__gfx94__)
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(
+                    reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                        .template get_as<ext_vector_t<fp16_t, 4>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                        .template get_as<ext_vector_t<fp16_t, 4>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
+#elif defined(__gfx908__)
+            static_for<0, 4, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x4f16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<fp16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<fp16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx950__)
+        return __builtin_amdgcn_mfma_f32_32x32x16_f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0);
+#elif defined(__gfx90a__) || defined(__gfx94__)
+        CVecType c_vec{0.f};
+        static_for<0, 2, 1>{}([&](auto k) {
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(
+                reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                    .template get_as<ext_vector_t<fp16_t, 4>>()[number<k>{}],
+                reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                    .template get_as<ext_vector_t<fp16_t, 4>>()[number<k>{}],
+                c_vec,
+                0,
+                0,
+                0);
+        });
+        return c_vec;
+#elif defined(__gfx908__)
+        CVecType c_vec{0.f};
+        static_for<0, 4, 1>{}([&](auto k) {
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x4f16(
+                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                    .template get_as<ext_vector_t<fp16_t, 2>>()[number<k>{}],
+                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                    .template get_as<ext_vector_t<fp16_t, 2>>()[number<k>{}],
+                c_vec,
+                0,
+                0,
+                0);
+        });
+        return c_vec;
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = bf16_t;
+    using BDataType                     = bf16_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<bf16_t, 8>;
+    using BVecType = ext_vector_t<bf16_t, 8>;
+    using CVecType = ext_vector_t<float, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 16;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x16_bf16", Ctrl)
+        else
+        {
+#if defined(__gfx950__)
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a_vec, b_vec, c_vec, 0, 0, 0);
+#elif defined(__gfx90a__) || defined(__gfx94__)
+            static_for<0, 2, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(
+                    reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 4>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 4>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
+#elif defined(__gfx908__)
+            static_for<0, 4, 1>{}([&](auto k) {
+                c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
+                    reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                        .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                    c_vec,
+                    0,
+                    0,
+                    0);
+            });
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx950__)
+        return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0);
+#elif defined(__gfx90a__) || defined(__gfx94__)
+        CVecType c_vec{0.f};
+        static_for<0, 2, 1>{}([&](auto k) {
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(
+                reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
+                    .template get_as<ext_vector_t<bf16_t, 4>>()[number<k>{}],
+                reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
+                    .template get_as<ext_vector_t<bf16_t, 4>>()[number<k>{}],
+                c_vec,
+                0,
+                0,
+                0);
+        });
+        return c_vec;
+#elif defined(__gfx908__)
+        CVecType c_vec{0.f};
+        static_for<0, 4, 1>{}([&](auto k) {
+            c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
+                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
+                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
+                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
+                c_vec,
+                0,
+                0,
+                0);
+        });
+        return c_vec;
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
 // FP8
+template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = AType_;
+    using BDataType                     = BType_;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<ADataType, 8>;
+    using BVecType = ext_vector_t<BDataType, 8>;
+    using CVecType = ext_vector_t<CDataType, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 32;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        if constexpr(Ctrl == WGAttrCtlEnum::Raw_vvv)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_116x16x32_fp8_bf8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "v", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "v", "v", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vaa)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "a", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "a", "a", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vav)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "a", "v", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "a", "v", "v")
+            }
+        }
+        else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vva)
+        {
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "v", "a", "v")
+            }
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            {
+                DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "v", "a", "v")
+            }
+        }
+        else
+        {
+#if defined(__gfx94__)
+            if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+            else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+                c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(
+                    bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx94__)
+        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_316x16x32_bf8_bf8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
 template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
 {
@@ -809,11 +1323,17 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 =
     WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, Ctrl_>;
-
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8 =
+    WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<fp8_t, fp8_t, Ctrl_>;
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 =
     WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t, Ctrl_>;
 
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+using WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8 =
+    WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<bf8_t, bf8_t, Ctrl_>;
+
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 =
     WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t, Ctrl_>;
@@ -822,6 +1342,104 @@ template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 =
     WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t, Ctrl_>;
 
+template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = AType_;
+    using BDataType                     = BType_;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<ADataType, 32>;
+    using BVecType = ext_vector_t<BDataType, 32>;
+    using CVecType = ext_vector_t<CDataType, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 128;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 32;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        //__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, cbsz, blgp, opsel, scale_a,
+        // opsel, scale_b)
+#if defined(__gfx950__)
+        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, c_vec, 0, 0, 0, 0, 0, 0);
+        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, c_vec, 0, 1, 0, 0, 0, 0);
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, c_vec, 1, 0, 0, 0, 0, 0);
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, c_vec, 1, 1, 0, 0, 0, 0);
+#else
+        ck_tile::ignore = c_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+#endif
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+#if defined(__gfx950__)
+        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, CVecType{0.f}, 0, 0, 0, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, CVecType{0.f}, 0, 1, 0, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, CVecType{0.f}, 1, 0, 0, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                a_vec, b_vec, CVecType{0.f}, 1, 1, 0, 0, 0, 0));
+#else
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8 =
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<fp8_t, fp8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8 =
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<fp8_t, bf8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8 =
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<bf8_t, fp8_t, Ctrl_>;
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8 =
+    WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<bf8_t, bf8_t, Ctrl_>;
+
 // int8
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp
new file mode 100644
index 0000000000..84cdf17d66
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp"
+
+namespace ck_tile {
+
+/**
+ *  @brief Class describing structured sparsity mfma instructions.
+ *
+ * @paragraph Overview "Overview"
+ * Currently only 2:4 structured sparsity is supported, which is based on requirement that in every
+ * groups of four continuous elements there are at most two non-zero, which results in processing
+ * only half of elements in smfmac instruction. Because of structured sparsity A vector in smfmac
+ * instruction will be smaller than B vector by the factor of CompressionRatio. The indexes of
+ * non-zero elements are stored in `index` which is an additional parameter to assembly instruction.
+ * Every pair of two bit indexes are containing information about which two  elements in current
+ * group of 4 values are non-zero and should be used inside smfmac instruction. Structured sparsity
+ * format is supported only for A matrix for now.
+ */
+template <typename WarpGemmAttributeSmfmacImpl_>
+struct WarpGemmAttributeSmfmac
+{
+    using Impl = remove_cvref_t<WarpGemmAttributeSmfmacImpl_>;
+
+    using ADataType   = typename Impl::ADataType;
+    using BDataType   = typename Impl::BDataType;
+    using IdxDataType = typename Impl::IdxDataType;
+    using CDataType   = typename Impl::CDataType;
+
+    using AVecType = typename Impl::AVecType;
+    using BVecType = typename Impl::BVecType;
+    using CVecType = typename Impl::CVecType;
+
+    static constexpr index_t kM                = Impl::kM;
+    static constexpr index_t kN                = Impl::kN;
+    static constexpr index_t kK                = Impl::kK;
+    static constexpr index_t kKPerThread       = Impl::kABKPerLane;
+    static constexpr index_t kCompressionRatio = Impl::CompressionRatio;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
+
+    static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1,
+                  "Multi-block WarpGemmAttributeSmfmacImpl is not supported");
+
+    using AWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
+        tuple<sequence<2, 1>>,
+        tuple<sequence<0, 0>>,
+        sequence<2>,
+        sequence<1>>;
+
+    using BWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
+        tuple<sequence<2, 1>>,
+        tuple<sequence<0, 0>>,
+        sequence<2>,
+        sequence<1>>;
+
+    using CWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
+              sequence<Impl::kCNLane>>,
+        tuple<sequence<1, 2>>,
+        tuple<sequence<1, 0>>,
+        sequence<1, 1>,
+        sequence<0, 2>>;
+
+    // c_vec += a_vec * b_vec[idx]
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   const int32_t& idx,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        Impl{}(c_vec, a_vec, b_vec, idx, bool_constant<post_nop_>{});
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp
new file mode 100644
index 0000000000..97fd2a8742
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "warp_gemm_attribute_mfma_impl.hpp"
+
+namespace ck_tile {
+
+// fp16 2:4 structured sparsity
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeSmfmacImplF16F16F32M32N32K16
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using IdxDataType                   = int32_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<fp16_t, 4>;
+    using BVecType = ext_vector_t<fp16_t, 8>;
+    using CVecType = ext_vector_t<float, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 16;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    static constexpr index_t CompressionRatio = 2;
+
+    // c_vec += a_vec * b_vec[idx]
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   const int32_t& idx,
+                                   bool_constant<post_nop_> = {}) const
+    {
+#if defined(__gfx9__)
+        c_vec = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a_vec, b_vec, c_vec, idx, 0, 0);
+#else
+        ck_tile::ignore = c_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = idx;
+#endif
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeSmfmacImplF16F16F32M16N16K32
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = fp16_t;
+    using BDataType                     = fp16_t;
+    using IdxDataType                   = int32_t;
+    using CDataType                     = float;
+
+    using AVecType = ext_vector_t<fp16_t, 4>;
+    using BVecType = ext_vector_t<fp16_t, 8>;
+    using CVecType = ext_vector_t<float, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 32;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4;
+
+    static constexpr index_t CompressionRatio = 2;
+
+    // c_vec += a_vec * b_vec[idx]
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   const int32_t& idx,
+                                   bool_constant<post_nop_> = {}) const
+    {
+#if defined(__gfx9__)
+        c_vec = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a_vec, b_vec, c_vec, idx, 0, 0);
+#else
+        ck_tile::ignore = c_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = idx;
+#endif
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index 9c319b5e5f..0e3342c479 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,7 +16,8 @@ template <typename AType,
           index_t NPerWave,
           index_t KPerWave,
           bool TransposeC,
-          bool SwizzleA = false>
+          bool SwizzleA              = false,
+          bool UseStructuredSparsity = false>
 struct WarpGemmMfmaDispatcher;
 
 // clang-format off
@@ -35,6 +36,10 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
 
+// fp16 2:4 structural sparsity
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M16N16K32; };
+
 // bf16
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
@@ -52,14 +57,23 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float
 
 // fp8
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_bf8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_bf8_bf8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
 
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8; };
+
 // clang-format on
 } // namespace impl
 
@@ -70,7 +84,8 @@ template <typename AType,
           index_t NPerWave,
           index_t KPerWave,
           bool TransposeC,
-          bool SwizzleA = false>
+          bool SwizzleA              = false,
+          bool UseStructuredSparsity = false>
 using WarpGemmMfmaDispatcher = typename impl::WarpGemmMfmaDispatcher<AType,
                                                                      BType,
                                                                      CType,
@@ -78,6 +93,7 @@ using WarpGemmMfmaDispatcher = typename impl::WarpGemmMfmaDispatcher<AType,
                                                                      NPerWave,
                                                                      KPerWave,
                                                                      TransposeC,
-                                                                     SwizzleA>::Type;
+                                                                     SwizzleA,
+                                                                     UseStructuredSparsity>::Type;
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp
new file mode 100644
index 0000000000..9e028ddab0
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+namespace ck_tile {
+
+template <typename WarpGemmAttribute_>
+struct WarpGemmSmfmacImpl
+{
+    using WarpGemmAttribute = remove_cvref_t<WarpGemmAttribute_>;
+
+    static constexpr index_t kM = WarpGemmAttribute::kM;
+    static constexpr index_t kN = WarpGemmAttribute::kN;
+    static constexpr index_t kK = WarpGemmAttribute::kK;
+    /// @brief The number of elements in K dimension processed by single thread in wavefront.
+    ///
+    /// @note  Note that WarpGemm may run MFMA instruction multiple times (on different K).
+    ///        In such situation this value reflects this fact.
+    static constexpr index_t kKPerThread = WarpGemmAttribute::kKPerThread;
+
+    using ADataType = typename WarpGemmAttribute::ADataType;
+    using BDataType = typename WarpGemmAttribute::BDataType;
+    using CDataType = typename WarpGemmAttribute::CDataType;
+
+    using AWarpDstrEncoding = typename WarpGemmAttribute::AWarpDstrEncoding;
+    using BWarpDstrEncoding = typename WarpGemmAttribute::BWarpDstrEncoding;
+    using CWarpDstrEncoding = typename WarpGemmAttribute::CWarpDstrEncoding;
+
+    using AWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(AWarpDstrEncoding{}))>;
+    using BWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(BWarpDstrEncoding{}))>;
+    using CWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(CWarpDstrEncoding{}))>;
+
+    using AWarpTensor = static_distributed_tensor<ADataType, AWarpDstr>;
+    using BWarpTensor = static_distributed_tensor<BDataType, BWarpDstr>;
+    using CWarpTensor = static_distributed_tensor<CDataType, CWarpDstr>;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access()
+    {
+        return WarpGemmAttribute_::get_num_of_access();
+    }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Compress A vector for 2:4 structured sparsity instruction by moving all non-zero
+    ///             elements into lower part of a_vec to half its effective size.
+    ///
+    /// @param      a_vec  Vector to be compressed.
+    ///
+    /// @return     Four 2-bit indexes of non-zero elements locations
+    ///
+    template <typename AVec>
+    CK_TILE_DEVICE int32_t compress_a(AVec& a_vec) const
+    {
+        int32_t idx = 0b11101110;
+
+        static_for<0, 2, 1>{}([&](auto i) {
+            ADataType nonzero_elems[2] = {a_vec[i * 4 + 2], a_vec[i * 4 + 3]};
+            int32_t non_zero_pos       = 0;
+
+            static_for<0, 3, 1>{}([&](auto j) {
+                if(a_vec[i * 4 + j] != 0.0f)
+                {
+                    nonzero_elems[non_zero_pos] = a_vec[i * 4 + j];
+                    idx &= ~(0b11 << 2 * (i * 2 + non_zero_pos));
+                    idx |= j << 2 * (i * 2 + non_zero_pos);
+                    ++non_zero_pos;
+                }
+            });
+            a_vec[i * 2]     = nonzero_elems[0];
+            a_vec[i * 2 + 1] = nonzero_elems[1];
+        });
+
+        return idx;
+    }
+
+    template <typename CTensor, typename ATensor, typename BTensor, bool post_nop_ = false>
+    CK_TILE_DEVICE void
+    operator()(CTensor& c, const ATensor& a, const BTensor& b, bool_constant<post_nop_> = {}) const
+    {
+        static_assert(detail::is_similiar_distributed_tensor_v<CTensor, CWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<ATensor, AWarpTensor> &&
+                      detail::is_similiar_distributed_tensor_v<BTensor, BWarpTensor>);
+        constexpr auto CompressionRatio = WarpGemmAttribute::kCompressionRatio;
+
+        using AVec = ext_vector_t<ADataType, ATensor::get_thread_buffer_size()>;
+        using AVecCompressed =
+            ext_vector_t<ADataType, ATensor::get_thread_buffer_size() / CompressionRatio>;
+        using BVec = ext_vector_t<BDataType, BTensor::get_thread_buffer_size()>;
+        using CVec = ext_vector_t<CDataType, CTensor::get_thread_buffer_size()>;
+
+        constexpr auto I0 = number<0>{};
+
+        auto a_vec       = a.get_thread_buffer().template get_as<AVec>()[I0];
+        const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];
+        auto c_vec       = c.get_thread_buffer().template get_as<CVec>()[I0];
+
+        const int32_t idx = compress_a(a_vec);
+
+        // @TODO can we simply set a_vec_pruned to a_vec[0:3]?
+        const AVecCompressed a_vec_pruned = {a_vec[0], a_vec[1], a_vec[2], a_vec[3]};
+
+        // c_vec += a_vec * b_vec[idx]
+        WarpGemmAttribute{}(c_vec, a_vec_pruned, b_vec, idx, bool_constant<post_nop_>{});
+
+        c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index b0b0c194ad..73cdd084c6 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -182,9 +182,16 @@ struct Layernorm2dFwdPipelineTwoPass
         ck_tile::index_t stride_to_right_most_window =
             row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
 
-        move_tile_window(x_window, {0, -Block_N});
-        move_tile_window(x_residual_window, {0, -Block_N});
-        move_tile_window(x_bias_window, {-Block_N});
+        if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
+        {
+            move_tile_window(y_residual_window, {0, -Block_N});
+        }
+        else
+        {
+            move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(x_residual_window, {0, -Block_N});
+            move_tile_window(x_bias_window, {-Block_N});
+        }
         move_tile_window(gamma_window, {stride_to_right_most_window});
         move_tile_window(beta_window, {stride_to_right_most_window});
         move_tile_window(y_window, {0, stride_to_right_most_window});
@@ -192,28 +199,43 @@ struct Layernorm2dFwdPipelineTwoPass
         // layernorm computation
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            auto x            = load_tile(x_window);
-            auto x_resi       = load_tile(x_residual_window);
-            const auto x_bias = load_tile(x_bias_window);
-            auto acc          = cast_tile<ComputeDataType>(x);
+            auto acc = make_static_distributed_tensor<ComputeDataType>(
+                decltype(load_tile(x_window))::get_tile_distribution());
 
-            if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
             {
-                sweep_tile(x, [&](auto idx) {
-                    // compute x = bias + x
-                    constexpr auto j_idx = make_tuple(idx[number<1>{}]);
-                    acc(idx)             = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
-                });
+                acc = cast_tile<ComputeDataType>(load_tile(y_residual_window));
+                move_tile_window(y_residual_window, {0, -Block_N});
+            }
+            else
+            {
+                acc = cast_tile<ComputeDataType>(load_tile(x_window));
+                move_tile_window(x_window, {0, -Block_N});
+
+                if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+                {
+                    const auto x_bias = load_tile(x_bias_window);
+                    move_tile_window(x_bias_window, {-Block_N});
+
+                    sweep_tile(acc, [&](auto idx) {
+                        // compute x = bias + x
+                        constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+                        acc(idx) = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
+                    });
+                }
+
+                if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+                {
+                    auto x_resi = load_tile(x_residual_window);
+                    move_tile_window(x_residual_window, {0, -Block_N});
+
+                    sweep_tile(x_resi, [&](auto idx) {
+                        // compute x = x_resi + x
+                        acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
+                    });
+                }
             }
 
-            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
-                         kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
-            {
-                sweep_tile(x_resi, [&](auto idx) {
-                    // compute x = x_resi + x
-                    acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
-                });
-            }
             // load gamma/beta (TODO: support no gamma/beta?)
             const auto gamma = load_tile(gamma_window);
             const auto beta  = load_tile(beta_window);
@@ -235,9 +257,6 @@ struct Layernorm2dFwdPipelineTwoPass
             static_assert(kFusedQuant != Layernorm2dFusedQuantEnum::DYNAMIC_QUANT);
             Epilogue{}(y_window, ln);
 
-            move_tile_window(x_window, {0, -Block_N});
-            move_tile_window(x_residual_window, {0, -Block_N});
-            move_tile_window(x_bias_window, {-Block_N});
             move_tile_window(gamma_window, {-Block_N});
             move_tile_window(beta_window, {-Block_N});
             move_tile_window(y_window, {0, -Block_N});
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
index 88c8084de6..f0251177d4 100644
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
@@ -21,6 +21,7 @@ struct Rmsnorm2dFwdHostArgs
     void* p_y_residual; // [m, n], shortcut output, prec same as input, nullptr if not used
     void* p_y_scale;    // [m, 1], output a dynamic quant per row, nullptr if not used
     void* p_invRms;     // [m, 1], output inv-rms, prec same as input, nullptr if not used
+    void* p_y_unquant;  // [m, n], output result before quant, nullptr if not used
 
     float epsilon;
 
@@ -47,13 +48,15 @@ struct Rmsnorm2dFwd
     using InvRmsDataType      = remove_cvref_t<typename Problem::InvRmsDataType>;
     using SmoothScaleDataType = remove_cvref_t<typename Problem::SmoothScaleDataType>;
     using YScaleDataType      = remove_cvref_t<typename Problem::YScaleDataType>;
+    using UnquantYDataType    = remove_cvref_t<typename Problem::UnquantYDataType>;
 
     // for simplicity, shortcut input/output type is same as X
     using XResidualDataType = XDataType;
     using YResidualDataType = XDataType;
 
-    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, null_type>;
-    static constexpr bool kSaveInvRms = Problem::Traits::kSaveInvRms;
+    static constexpr bool kHasGamma    = !std::is_same_v<GammaDataType, null_type>;
+    static constexpr bool kSaveInvRms  = Problem::Traits::kSaveInvRms;
+    static constexpr bool kSaveUnquant = Problem::Traits::kSaveUnquant;
 
     static constexpr index_t Block_M  = Problem::BlockShape::Block_M;
     static constexpr index_t Block_N  = Problem::BlockShape::Block_N;
@@ -81,6 +84,7 @@ struct Rmsnorm2dFwd
         void* p_y_residual;
         void* p_y_scale;
         void* p_invRms;
+        void* p_y_unquant;
 
         float epsilon;
 
@@ -103,6 +107,7 @@ struct Rmsnorm2dFwd
                      hargs.p_y_residual,
                      hargs.p_y_scale,
                      hargs.p_invRms,
+                     hargs.p_y_unquant,
                      hargs.epsilon,
                      hargs.m,
                      hargs.n,
@@ -323,6 +328,30 @@ struct Rmsnorm2dFwd
             }
         }();
 
+        auto unquant_y_window = [&]() {
+            if constexpr((kFusedQuant == Rmsnorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT ||
+                          kFusedQuant == Rmsnorm2dFusedQuantEnum::DYNAMIC_QUANT) &&
+                         kSaveUnquant)
+            {
+                auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                    static_cast<UnquantYDataType*>(kargs.p_y_unquant),
+                    make_tuple(kargs.m, kargs.n),
+                    make_tuple(kargs.y_stride, 1),
+                    number<Vector_N>{},
+                    number<1>{});
+
+                auto tmp2_ = pad_tensor_view(tmp_,
+                                             make_tuple(number<Block_M>{}, number<Block_N>{}),
+                                             sequence<kPadM, kPadN>{});
+                return make_tile_window(
+                    tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
+            }
+            else
+            {
+                return make_null_tile_window(make_tuple(number<Block_M>{}, number<Block_N>{}));
+            }
+        }();
+
         __shared__ char smem[GetSmemSize()];
 
         Pipeline{}(x_window,
@@ -333,6 +362,7 @@ struct Rmsnorm2dFwd
                    inv_rms_window,
                    sm_scale_window,
                    y_scale_window,
+                   unquant_y_window,
                    static_cast<const ComputeDataType>(kargs.epsilon),
                    kargs.n,
                    smem,
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp
index 93c2833be4..58159142d0 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp
@@ -25,8 +25,9 @@ struct Rmsnorm2dFwdPipelineOnePass
     using XResidualDataType = XDataType;
     using YResidualDataType = XDataType;
 
-    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
-    static constexpr bool kSaveInvRms = Problem::Traits::kSaveInvRms;
+    static constexpr bool kHasGamma    = !std::is_same_v<GammaDataType, ck_tile::null_type>;
+    static constexpr bool kSaveInvRms  = Problem::Traits::kSaveInvRms;
+    static constexpr bool kSaveUnquant = Problem::Traits::kSaveUnquant;
 
     static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
     static constexpr bool kPadM              = false; // TODO - BlockRmsnorm2dFwdProblem::kPadM
@@ -54,6 +55,7 @@ struct Rmsnorm2dFwdPipelineOnePass
               typename InvRmsWindow,
               typename SmoothScaleWindow,
               typename YScaleWindow,
+              typename UnquantYWindow,
               typename Epilogue>
     CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
                                    const XResidualWindow& x_residual_window_,
@@ -63,6 +65,7 @@ struct Rmsnorm2dFwdPipelineOnePass
                                    InvRmsWindow& inv_rms_window,
                                    const SmoothScaleWindow& sm_scale_window_,
                                    YScaleWindow& y_scale_window_,
+                                   UnquantYWindow& unquant_y_window,
                                    ComputeDataType epsilon,
                                    ck_tile::index_t row_size,
                                    void* smem,
@@ -137,11 +140,26 @@ struct Rmsnorm2dFwdPipelineOnePass
 
         if constexpr(kFusedQuant == Rmsnorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT)
         {
-            Epilogue{}(y_window_, sm_scale_window_, y_scale_window_, rmsn, smem);
+            if constexpr(kSaveUnquant)
+            {
+                Epilogue{}(
+                    unquant_y_window, y_window_, sm_scale_window_, y_scale_window_, rmsn, smem);
+            }
+            else
+            {
+                Epilogue{}(y_window_, sm_scale_window_, y_scale_window_, rmsn, smem);
+            }
         }
         else if constexpr(kFusedQuant == Rmsnorm2dFusedQuantEnum::DYNAMIC_QUANT)
         {
-            Epilogue{}(y_window_, y_scale_window_, rmsn, smem);
+            if constexpr(kSaveUnquant)
+            {
+                Epilogue{}(unquant_y_window, y_window_, y_scale_window_, rmsn, smem);
+            }
+            else
+            {
+                Epilogue{}(y_window_, y_scale_window_, rmsn, smem);
+            }
         }
         else
         {
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
index baf56246f3..773df4f0f4 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp
@@ -12,6 +12,7 @@ template <typename XDataType_,
           typename ComputeDataType_,
           typename YDataType_,
           typename InvRmsDataType_,
+          typename UnquantYDataType_,
           typename SmoothScaleDataType_,
           typename YScaleDataType_,
           typename BlockShape_,
@@ -23,6 +24,7 @@ struct Rmsnorm2dFwdPipelineProblem
     using ComputeDataType     = remove_cvref_t<ComputeDataType_>;
     using YDataType           = remove_cvref_t<YDataType_>;
     using InvRmsDataType      = remove_cvref_t<InvRmsDataType_>;
+    using UnquantYDataType    = remove_cvref_t<UnquantYDataType_>;
     using SmoothScaleDataType = remove_cvref_t<SmoothScaleDataType_>;
     using YScaleDataType      = remove_cvref_t<YScaleDataType_>;
     using BlockShape          = remove_cvref_t<BlockShape_>;
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
index c29a6cb07d..4ca1dbc5da 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
@@ -54,6 +54,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
               typename InvRmsWindow,
               typename SmoothScaleWindow,
               typename YScaleWindow,
+              typename UnquantYWindow,
               typename Epilogue>
     CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
                                    const XResidualWindow& x_residual_window_,
@@ -63,6 +64,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
                                    InvRmsWindow& inv_rms_window,
                                    const SmoothScaleWindow& /*sm_scale_window_*/,
                                    YScaleWindow& /*y_scale_window*/,
+                                   UnquantYWindow& /*unquant_y_window*/,
                                    ComputeDataType epsilon,
                                    ck_tile::index_t row_size,
                                    void* smem,
@@ -136,32 +138,51 @@ struct Rmsnorm2dFwdPipelineTwoPass
         ck_tile::index_t stride_to_right_most_window =
             row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
 
-        move_tile_window(x_window, {0, -Block_N});
-        move_tile_window(x_residual_window, {0, -Block_N});
+        if constexpr(kFusedAdd == Rmsnorm2dFusedAddEnum::PRE_ADD_STORE)
+        {
+            move_tile_window(y_residual_window, {0, -Block_N});
+        }
+        else
+        {
+            move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(x_residual_window, {0, -Block_N});
+        }
         move_tile_window(gamma_window, {stride_to_right_most_window});
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // rmsnorm computation
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            auto x      = load_tile(x_window);
-            auto x_resi = load_tile(x_residual_window);
-            auto acc    = cast_tile<ComputeDataType>(x);
+            auto acc = make_static_distributed_tensor<ComputeDataType>(
+                decltype(load_tile(x_window))::get_tile_distribution());
 
-            if constexpr(kFusedAdd == Rmsnorm2dFusedAddEnum::PRE_ADD_STORE ||
-                         kFusedAdd == Rmsnorm2dFusedAddEnum::PRE_ADD)
+            if constexpr(kFusedAdd == Rmsnorm2dFusedAddEnum::PRE_ADD_STORE)
             {
-                sweep_tile(x_resi, [&](auto idx) {
-                    // compute x = x_resi + x
-                    acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
-                });
+                acc = cast_tile<ComputeDataType>(load_tile(y_residual_window));
+                move_tile_window(y_residual_window, {0, -Block_N});
+            }
+            else
+            {
+                acc = cast_tile<ComputeDataType>(load_tile(x_window));
+                move_tile_window(x_window, {0, -Block_N});
+
+                if constexpr(kFusedAdd == Rmsnorm2dFusedAddEnum::PRE_ADD)
+                {
+                    auto x_resi = load_tile(x_residual_window);
+                    sweep_tile(x_resi, [&](auto idx) {
+                        // compute x = x_resi + x
+                        acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
+                    });
+                    move_tile_window(x_residual_window, {0, -Block_N});
+                }
             }
 
             // load gamma (TODO: support no gamma?)
             const auto gamma = load_tile(gamma_window);
 
             // rmsnorm computation
-            auto rmsn = make_static_distributed_tensor<ComputeDataType>(x.get_tile_distribution());
+            auto rmsn = make_static_distributed_tensor<ComputeDataType>(
+                decltype(load_tile(x_window))::get_tile_distribution());
             sweep_tile(rmsn, [&, inv_rms_ = inv_rms](auto idx) {
                 constexpr auto i_idx = make_tuple(idx[number<0>{}]);
                 constexpr auto j_idx = make_tuple(idx[number<1>{}]);
@@ -176,8 +197,6 @@ struct Rmsnorm2dFwdPipelineTwoPass
             static_assert(kFusedQuant == Rmsnorm2dFusedQuantEnum::NO_SWEEP);
             Epilogue{}(y_window, rmsn);
 
-            move_tile_window(x_window, {0, -Block_N});
-            move_tile_window(x_residual_window, {0, -Block_N});
             move_tile_window(gamma_window, {-Block_N});
             move_tile_window(y_window, {0, -Block_N});
         }
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp
index cb7beba291..152da60c01 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp
@@ -39,6 +39,7 @@ template<> struct Rmsnorm2dFusedQuantEnumName<Rmsnorm2dFusedQuantEnum::SMOOTH_DY
 
 template <bool kPadN_,
           bool kSaveInvRms_,
+          bool kSaveUnquant_,
           bool kTwoPass_,
           Rmsnorm2dFusedAddEnum kFusedAdd_,
           Rmsnorm2dFusedQuantEnum kFusedQuant_>
@@ -46,6 +47,7 @@ struct Rmsnorm2dFwdTraits
 {
     static constexpr bool kPadN                          = kPadN_;
     static constexpr bool kSaveInvRms                    = kSaveInvRms_;
+    static constexpr bool kSaveUnquant                   = kSaveUnquant_;
     static constexpr bool kTwoPass                       = kTwoPass_;
     static constexpr Rmsnorm2dFusedAddEnum kFusedAdd     = kFusedAdd_;
     static constexpr Rmsnorm2dFusedQuantEnum kFusedQuant = kFusedQuant_;
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
new file mode 100644
index 0000000000..120bf7484a
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <unordered_map>
+
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename D2DataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          index_t ActivationType_ = 0,
+          bool MulRoutedWeight    = true,
+          typename ComputeTypeA   = CDataType,
+          typename ComputeTypeB   = ComputeTypeA>
+struct ReferenceMoeGemm : public device::BaseOperator
+{
+    // Argument
+    static constexpr auto ActivationType = ActivationType_;
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<ck::index_t>& sorted_token_ids,
+                 const Tensor<ck::index_t>& expert_ids,
+                 const Tensor<ck::index_t>& max_token_id,
+                 const index_t sorted_tile_size,
+                 const Tensor<ADataType>& a_t_k,
+                 const Tensor<float>& a_scale_t,
+                 const Tensor<BDataType>& b_e_n_k,
+                 const Tensor<float>& b_scale_e_n,
+                 Tensor<CDataType>& c_t_k_n,
+                 const Tensor<D2DataType>& d2,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+            : sorted_token_ids_{sorted_token_ids},
+              expert_ids_{expert_ids},
+              max_token_id_{max_token_id},
+              sorted_tile_size_{sorted_tile_size},
+              a_t_k_{a_t_k},
+              a_scale_t_{a_scale_t},
+              b_e_n_k_{b_e_n_k},
+              b_scale_e_n_{b_scale_e_n},
+              c_t_k_n_{c_t_k_n},
+              d2_{d2},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              c_element_op_{c_element_op}
+        {
+        }
+
+        const Tensor<ck::index_t>& sorted_token_ids_;
+        const Tensor<ck::index_t>& expert_ids_;
+        const Tensor<ck::index_t>& max_token_id_;
+        index_t sorted_tile_size_;
+        const Tensor<ADataType>& a_t_k_;
+        const Tensor<float>& a_scale_t_;
+        const Tensor<BDataType>& b_e_n_k_;
+        const Tensor<float>& b_scale_e_n_;
+        Tensor<CDataType>& c_t_k_n_;
+        const Tensor<D2DataType>& d2_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CElementwiseOperation c_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceMoeGemm::Argument;
+
+        float Run(const Argument& arg)
+        {
+            static_assert(ActivationType < 2, "Not supported activation type");
+            const int full_n = arg.c_t_k_n_.mDesc.GetLengths()[2];
+            auto f_mk_kn_mn  = [&](auto m, auto n) {
+                const int K = arg.a_t_k_.mDesc.GetLengths()[1];
+                AccDataType v_acc_up{0};
+                ComputeTypeB v_b_up{0};
+                AccDataType v_acc{0};
+
+                ComputeTypeA v_a{0};
+                ComputeTypeB v_b{0};
+
+                const int t         = arg.sorted_token_ids_(m) & 0xffffff;
+                const int topk_id   = (arg.sorted_token_ids_(m) & 0xff000000) >> 24;
+                const int e         = arg.expert_ids_(m / arg.sorted_tile_size_);
+                const int token_cnt = arg.a_t_k_.mDesc.GetLengths()[0];
+                D2DataType v_topk_w = arg.d2_(m, 0); // expert
+                if(t < token_cnt)
+                {
+                    for(int k = 0; k < K; ++k)
+                    {
+                        if constexpr(is_same_v<ADataType, pk_i4_t>)
+                        {
+                            uint8_t i4x2 = arg.a_t_k_(t, k).data;
+                            uint8_t i4   = 0;
+                            if(k % 2 == 1)
+                                i4 = (i4x2 >> 0) & 0xf;
+                            else
+                                i4 = (i4x2 >> 4) & 0xf;
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+                            v_a = i4_to_f32_gfx9(i4);
+#else
+                            v_a    = i4 - 8;
+#endif
+                        }
+                        else
+                        {
+                            arg.a_element_op_(v_a, arg.a_t_k_(t, k));
+                        }
+                        // same for B matrix
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            uint8_t i4x2    = arg.b_e_n_k_(e, k, n).data;
+                            uint8_t i4x2_up = arg.b_e_n_k_(e, k, n + full_n).data;
+                            uint8_t i4      = 0;
+                            uint8_t i4_up   = 0;
+                            if(k % 2 == 1)
+                            {
+                                i4    = (i4x2 >> 0) & 0xf;
+                                i4_up = (i4x2_up >> 0) & 0xf;
+                            }
+                            else
+                            {
+                                i4    = (i4x2 >> 4) & 0xf;
+                                i4_up = (i4x2_up >> 4) & 0xf;
+                            }
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+                            v_b    = i4_to_f32_gfx9(i4);
+                            v_b_up = i4_to_f32_gfx9(i4_up);
+#else
+                            v_b    = i4 - 8;
+                            v_b_up = i4_up - 8;
+#endif
+                        }
+                        else
+                        {
+                            arg.b_element_op_(v_b, arg.b_e_n_k_(e, k, n));
+                            arg.b_element_op_(v_b_up, arg.b_e_n_k_(e, k, n + full_n));
+                        }
+
+                        v_acc +=
+                            ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
+                        v_acc_up += ck::type_convert<AccDataType>(v_a) *
+                                    ck::type_convert<AccDataType>(v_b_up);
+                    }
+                    CDataType v_c{0};
+                    CDataType v_c_up{0};
+                    if constexpr(MulRoutedWeight)
+                    {
+                        v_acc *= v_topk_w;
+                        v_acc_up *= v_topk_w;
+                    }
+
+                    arg.c_element_op_(v_c, v_acc);
+                    arg.c_element_op_(v_c_up, v_acc_up);
+
+                    if constexpr(ActivationType == 1)
+                    {
+                        v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t);
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            v_c_up *= 16;
+                            v_c *= 16;
+                        }
+                        tensor_operation::element_wise::Silu{}(v_c, v_c);
+                        v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t);
+                        arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
+                    }
+                    else if constexpr(ActivationType == 0)
+                    {
+                        v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t);
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            v_c_up *= 16;
+                            v_c *= 16;
+                        }
+                        tensor_operation::element_wise::Gelu{}(v_c, v_c);
+                        v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t);
+                        arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
+                    }
+                }
+            };
+
+            const ck::index_t max_token_id = arg.max_token_id_(0);
+            make_ParallelTensorFunctor(f_mk_kn_mn, max_token_id, full_n)(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<ck::index_t>& sorted_token_ids,
+                             const Tensor<ck::index_t>& expert_ids,
+                             const Tensor<ck::index_t>& max_token_id,
+                             const index_t sorted_tile_size,
+                             const Tensor<ADataType>& a_t_k,
+                             const Tensor<float>& a_scale_n,
+                             const Tensor<BDataType>& b_e_n_k,
+                             const Tensor<float>& b_scale_e_n,
+                             Tensor<CDataType>& c_t_k_n,
+                             const Tensor<D2DataType>& d2,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{sorted_token_ids,
+                        expert_ids,
+                        max_token_id,
+                        sorted_tile_size,
+                        a_t_k,
+                        a_scale_n,
+                        b_e_n_k,
+                        b_scale_e_n,
+                        c_t_k_n,
+                        d2,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceMoeGemm"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+
+    static float i4_to_f32_gfx9(uint8_t i4)
+    {
+        static std::unordered_map<uint8_t, float> u = {{0b1000, -0.5000f},
+                                                       {0b1001, -0.4375f},
+                                                       {0b1010, -0.3750f},
+                                                       {0b1011, -0.3125f},
+                                                       {0b1100, -0.2500f},
+                                                       {0b1101, -0.1875f},
+                                                       {0b1110, -0.1250f},
+                                                       {0b1111, -0.0625f},
+                                                       {0b0, +0.0000f},
+                                                       {0b1, +0.0625f},
+                                                       {0b10, +0.1250f},
+                                                       {0b11, +0.1875f},
+                                                       {0b100, +0.2500f},
+                                                       {0b101, +0.3125f},
+                                                       {0b110, +0.3750f},
+                                                       {0b111, +0.4375f}};
+
+        return u[i4];
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
new file mode 100644
index 0000000000..5c932fcb18
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <unordered_map>
+
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename D0DataType,
+          typename D1DataType,
+          typename D2DataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          bool MulRoutedWeight  = true,
+          typename ComputeTypeA = CDataType,
+          typename ComputeTypeB = ComputeTypeA>
+struct ReferenceMoeGemm2 : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<ck::index_t>& sorted_token_ids,
+                 const Tensor<ck::index_t>& expert_ids,
+                 const Tensor<ck::index_t>& max_token_id,
+                 const index_t sorted_tile_size,
+                 const Tensor<ADataType>& a_t_k_k,
+                 const Tensor<BDataType>& b_e_n_k,
+                 const Tensor<D0DataType>& d0,
+                 const Tensor<D1DataType>& d1,
+                 const Tensor<D2DataType>& d2,
+                 Tensor<CDataType>& c_t_n,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+            : sorted_token_ids_{sorted_token_ids},
+              expert_ids_{expert_ids},
+              max_token_id_{max_token_id},
+              sorted_tile_size_{sorted_tile_size},
+              a_t_k_k_{a_t_k_k},
+              b_e_n_k_{b_e_n_k},
+              d0_{d0},
+              d1_{d1},
+              d2_{d2},
+              c_t_n_{c_t_n},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              c_element_op_{c_element_op}
+        {
+        }
+
+        const Tensor<ck::index_t>& sorted_token_ids_;
+        const Tensor<ck::index_t>& expert_ids_;
+        const Tensor<ck::index_t>& max_token_id_;
+        index_t sorted_tile_size_;
+        const Tensor<ADataType>& a_t_k_k_;
+        const Tensor<BDataType>& b_e_n_k_;
+        const Tensor<D0DataType>& d0_;
+        const Tensor<D1DataType>& d1_;
+        const Tensor<D2DataType>& d2_;
+        Tensor<CDataType>& c_t_n_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CElementwiseOperation c_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceMoeGemm2::Argument;
+
+        float Run(const Argument& arg)
+        {
+            arg.c_t_n_.SetZero();
+            auto f_mk_kn_mn = [&](auto m, auto n) {
+                const int K = arg.a_t_k_k_.mDesc.GetLengths()[2];
+                AccDataType v_acc{0};
+                ComputeTypeA v_a{0};
+                ComputeTypeB v_b{0};
+                const int t         = arg.sorted_token_ids_(m) & 0xffffff;
+                const int topk_id   = arg.sorted_token_ids_(m) >> 24;
+                const int e         = arg.expert_ids_(m / arg.sorted_tile_size_);
+                const int token_cnt = arg.c_t_n_.mDesc.GetLengths()[0];
+                D2DataType v_topk_w = arg.d2_(m, 0); // expert
+
+                if(t < token_cnt)
+                {
+                    for(int k = 0; k < K; ++k)
+                    {
+                        if constexpr(is_same_v<ADataType, pk_i4_t>)
+                        {
+                            uint8_t i4x2 = arg.a_t_k_(t, topk_id, k).data;
+                            uint8_t i4   = 0;
+                            if(k % 2 == 1)
+                                i4 = (i4x2 >> 0) & 0xf;
+                            else
+                                i4 = (i4x2 >> 4) & 0xf;
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+                            v_a = i4_to_f32_gfx9(i4);
+#else
+                            v_a = i4 - 8;
+#endif
+                        }
+                        else
+                        {
+                            arg.a_element_op_(v_a, arg.a_t_k_k_(t, topk_id, k));
+                        }
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            uint8_t i4x2 = arg.b_e_n_k_(e, k, n).data;
+                            uint8_t i4   = 0;
+                            if(k % 2 == 1)
+                                i4 = (i4x2 >> 0) & 0xf;
+                            else
+                                i4 = (i4x2 >> 4) & 0xf;
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+                            v_b = i4_to_f32_gfx9(i4);
+#else
+                            v_b = i4 - 8;
+#endif
+                        }
+                        else
+                        {
+                            arg.b_element_op_(v_b, arg.b_e_n_k_(e, k, n));
+                        }
+
+                        v_acc +=
+                            ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
+                    }
+                    CDataType v_c{0};
+                    D0DataType v_d0 = arg.d0_(m, n); // a
+                    D0DataType v_d1 = arg.d1_(e, n); // b
+                    if constexpr(MulRoutedWeight)
+                    {
+                        arg.c_element_op_(v_c, v_acc, v_d0, v_d1, v_topk_w);
+                    }
+                    else
+                    {
+                        arg.c_element_op_(v_c, v_acc, v_d0, v_d1, 1.f);
+                    }
+                    arg.c_t_n_(t, n) += v_c;
+                }
+            };
+
+            const ck::index_t max_token_id = arg.max_token_id_(0);
+            make_ParallelTensorFunctor(f_mk_kn_mn, max_token_id, arg.c_t_n_.mDesc.GetLengths()[1])(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<ck::index_t>& sorted_token_ids,
+                             const Tensor<ck::index_t>& expert_ids,
+                             const Tensor<ck::index_t>& max_token_id,
+                             const index_t sorted_tile_size,
+                             const Tensor<ADataType>& a_t_k_k,
+                             const Tensor<BDataType>& b_e_n_k,
+                             const Tensor<D0DataType>& d0,
+                             const Tensor<D1DataType>& d1,
+                             const Tensor<D2DataType>& d2,
+                             Tensor<CDataType>& c_t_n,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{sorted_token_ids,
+                        expert_ids,
+                        max_token_id,
+                        sorted_tile_size,
+                        a_t_k_k,
+                        b_e_n_k,
+                        d0,
+                        d1,
+                        d2,
+                        c_t_n,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceMoeGemm2"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+    static float i4_to_f32_gfx9(uint8_t i4)
+    {
+        static std::unordered_map<uint8_t, float> u = {{0b1000, -0.5000f},
+                                                       {0b1001, -0.4375f},
+                                                       {0b1010, -0.3750f},
+                                                       {0b1011, -0.3125f},
+                                                       {0b1100, -0.2500f},
+                                                       {0b1101, -0.1875f},
+                                                       {0b1110, -0.1250f},
+                                                       {0b1111, -0.0625f},
+                                                       {0b0, +0.0000f},
+                                                       {0b1, +0.0625f},
+                                                       {0b10, +0.1250f},
+                                                       {0b11, +0.1875f},
+                                                       {0b100, +0.2500f},
+                                                       {0b101, +0.3125f},
+                                                       {0b110, +0.3750f},
+                                                       { 0b111,
+                                                         +0.4375f }};
+
+        return u[i4];
+    }
+#endif
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 4a44c425aa..c3fd04ba35 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -71,6 +71,10 @@ using GKXC   = ck::tensor_layout::convolution::GKXC;
 using GKYXC  = ck::tensor_layout::convolution::GKYXC;
 using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 
+using GKCX   = ck::tensor_layout::convolution::GKCX;
+using GKCYX  = ck::tensor_layout::convolution::GKCYX;
+using GKCZYX = ck::tensor_layout::convolution::GKCZYX;
+
 using GNWK   = ck::tensor_layout::convolution::GNWK;
 using GNHWK  = ck::tensor_layout::convolution::GNHWK;
 using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp
index 7553d5e76e..3fa82ae53a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp
@@ -17,7 +17,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -28,14 +28,14 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_i
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_kpadding_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -46,14 +46,14 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -64,14 +64,14 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -82,61 +82,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpaddin
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
-                                                            128,
-                                                            128,
-                                                            PassThrough,
-                                                            PassThrough,
-                                                            PassThrough>>>& instances);
-
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
-                                                            Col,
-                                                            Tuple<>,
-                                                            Row,
-                                                            F8,
-                                                            F32,
-                                                            F8,
-                                                            F32,
-                                                            Tuple<>,
-                                                            BF16,
-                                                            128,
-                                                            128,
-                                                            128,
-                                                            PassThrough,
-                                                            PassThrough,
-                                                            PassThrough>>>& instances);
-
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
-                                                            Col,
-                                                            Tuple<>,
-                                                            Row,
-                                                            F8,
-                                                            F32,
-                                                            F8,
-                                                            F32,
-                                                            Tuple<>,
-                                                            BF16,
-                                                            128,
-                                                            128,
-                                                            128,
-                                                            PassThrough,
-                                                            PassThrough,
-                                                            PassThrough>>>& instances);
-
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
-                                                            Col,
-                                                            Tuple<>,
-                                                            Row,
-                                                            F8,
-                                                            F32,
-                                                            F8,
-                                                            F32,
-                                                            Tuple<>,
-                                                            BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
@@ -163,7 +109,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
     B1DataType,
     Tuple<>,
     CDataType,
-    128,
+    1,
     128,
     128,
     ck::tensor_operation::element_wise::PassThrough,
@@ -180,7 +126,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                                                  B1DataType,
                                                  Tuple<>,
                                                  CDataType,
-                                                 128,
+                                                 1,
                                                  128,
                                                  128,
                                                  ck::tensor_operation::element_wise::PassThrough,
@@ -198,20 +144,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                          is_same_v<CLayout, Row>)
             {
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instances(
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(
                     op_ptrs);
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instances(
-                    op_ptrs);
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instances(
-                    op_ptrs);
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instances(
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_kpadding_instances(
                     op_ptrs);
 
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instances(
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_default_instances(
                     op_ptrs);
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instances(
-                    op_ptrs);
-                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instances(
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_instances(
                     op_ptrs);
             }
         }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
index 58f8ed3c3c..6475b801b8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
@@ -122,6 +122,32 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kp
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
 
+void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
 void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
                                                           Col,
@@ -360,6 +386,32 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpa
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
 
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
 void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
                                                           Col,
@@ -556,6 +608,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
                 add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part2(
                     op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instances_part3(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part3(
+                    op_ptrs);
 
                 add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
                     op_ptrs);
@@ -593,6 +649,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
                 add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part2(
                     op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instances_part3(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part3(
+                    op_ptrs);
 
                 add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_default_instances(
                     op_ptrs);
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp
deleted file mode 100644
index 90da3ad0fe..0000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp
+++ /dev/null
@@ -1,317 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-#include <memory>
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-#if(defined(CK_ENABLE_F16) || defined(CK_ENABLE_FP8))
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-#endif
-
-#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-#endif
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceGemmMultipleDSplitKBPreShuffle<
-        ALayout,
-        BLayout,
-        Tuple<Row, Col>,
-        CLayout,
-        ADataType,
-        BDataType,
-        Tuple<F32, F32>,
-        CDataType,
-        ck::tensor_operation::element_wise::PassThrough,
-        ck::tensor_operation::element_wise::PassThrough,
-        ck::tensor_operation::element_wise::MultiplyMultiply>>
-{
-    using DeviceOp =
-        DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
-                                             BLayout,
-                                             Tuple<Row, Col>,
-                                             CLayout,
-                                             ADataType,
-                                             BDataType,
-                                             Tuple<F32, F32>,
-                                             CDataType,
-                                             ck::tensor_operation::element_wise::PassThrough,
-                                             ck::tensor_operation::element_wise::PassThrough,
-                                             ck::tensor_operation::element_wise::MultiplyMultiply>;
-
-    static auto GetInstances()
-    {
-        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-// TODO: Add MFMA layout into tensor layout
-#if(defined(CK_ENABLE_F16) || defined(CK_ENABLE_FP8))
-        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
-                     is_same_v<CDataType, half_t>)
-        {
-            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
-                         is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances(
-                    op_ptrs);
-            }
-        }
-#endif
-
-#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
-        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
-                     is_same_v<CDataType, bhalf_t>)
-        {
-            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
-                         is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
-                    op_ptrs);
-            }
-        }
-#endif
-        return op_ptrs;
-    }
-};
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_wp.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_wp.hpp
new file mode 100644
index 0000000000..07891ea932
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_wp.hpp
@@ -0,0 +1,664 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#if(defined(CK_ENABLE_F16) || defined(CK_ENABLE_FP8))
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p4(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p5(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+#endif
+
+#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p4(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p5(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p6(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+#endif
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMultipleDSplitKBPreShuffle<
+        ALayout,
+        BLayout,
+        Tuple<Row, Col>,
+        CLayout,
+        ADataType,
+        BDataType,
+        Tuple<F32, F32>,
+        CDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::MultiplyMultiply>>
+{
+    using DeviceOp =
+        DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
+                                             BLayout,
+                                             Tuple<Row, Col>,
+                                             CLayout,
+                                             ADataType,
+                                             BDataType,
+                                             Tuple<F32, F32>,
+                                             CDataType,
+                                             ck::tensor_operation::element_wise::PassThrough,
+                                             ck::tensor_operation::element_wise::PassThrough,
+                                             ck::tensor_operation::element_wise::MultiplyMultiply>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+// TODO: Add MFMA layout into tensor layout
+#if(defined(CK_ENABLE_F16) || defined(CK_ENABLE_FP8))
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p3(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p4(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p5(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
+                    op_ptrs);
+            }
+        }
+#endif
+
+#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p2(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p3(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p4(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p5(
+                    op_ptrs);
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p6(
+                    op_ptrs);
+            }
+        }
+#endif
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp
new file mode 100644
index 0000000000..1c40ccec5d
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             F8,
+                                             e8m0_bexp_t,
+                                             F8,
+                                             e8m0_bexp_t,
+                                             F16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances);
+
+void add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             F8,
+                                             e8m0_bexp_t,
+                                             F8,
+                                             e8m0_bexp_t,
+                                             BF16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances);
+
+template <typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          index_t ScaleBlockSize,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGemmMX<ALayout,
+                                               BLayout,
+                                               CLayout,
+                                               ADataType,
+                                               AScaleDataType,
+                                               BDataType,
+                                               BScaleDataType,
+                                               CDataType,
+                                               ScaleBlockSize,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceGemmMX<ALayout,
+                                  BLayout,
+                                  CLayout,
+                                  ADataType,
+                                  AScaleDataType,
+                                  BDataType,
+                                  BScaleDataType,
+                                  CDataType,
+                                  ScaleBlockSize,
+                                  ck::tensor_operation::element_wise::PassThrough,
+                                  ck::tensor_operation::element_wise::PassThrough,
+                                  ck::tensor_operation::element_wise::PassThrough>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> && is_same_v<CLayout, Row>)
+        {
+            if constexpr(is_same_v<ADataType, F8> && is_same_v<BDataType, F8> &&
+                         is_same_v<CDataType, F16>)
+            {
+
+                add_device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instances(op_ptrs);
+            }
+            if constexpr(is_same_v<ADataType, F8> && is_same_v<BDataType, F8> &&
+                         is_same_v<CDataType, BF16>)
+            {
+
+                add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances(op_ptrs);
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
index 4218c51ca3..79212e16dd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,521 +7,22 @@
 #include <memory>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 
+#ifdef CK_USE_WMMA
+#include "gemm_universal_wmma.inc"
+#endif
+#ifdef CK_USE_XDL
+#include "gemm_universal_xdl.inc"
+#endif
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-#ifdef CK_ENABLE_FP16
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-#endif
-#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8))
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, I4, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, I4, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-#endif
-#ifdef CK_ENABLE_BF16
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-#endif
-#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-#endif
 
 template <typename ADataType,
           typename BDataType,
@@ -554,6 +55,82 @@ struct DeviceOperationInstanceFactory<
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 
+#ifdef CK_USE_WMMA
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instances(op_ptrs);
+            }
+        }
+#endif
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(op_ptrs);
+            }
+        }
+#endif
+#endif // CK_USE_WMMA
+
+#ifdef CK_USE_XDL
 #ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<CDataType, half_t>)
@@ -822,6 +399,7 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+#ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, pk_i4_t> &&
                      is_same_v<CDataType, half_t>)
         {
@@ -831,7 +409,8 @@ struct DeviceOperationInstanceFactory<
                 add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs);
             }
         }
-
+#endif
+#ifdef CK_ENABLE_BF16
         if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, pk_i4_t> &&
                      is_same_v<CDataType, bhalf_t>)
         {
@@ -842,6 +421,8 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
             }
         }
+#endif
+#endif // CK_USE_XDL
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
index 18203e7d5c..372e744bd7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
@@ -635,7 +635,7 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadd
                                                       PassThrough>>>& instances);
 #endif
 
-#if(defined(CK_ENABLE_FP8))
+#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8))
 void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
     std::vector<std::unique_ptr<
         DeviceGemm_Streamk_V2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -834,6 +834,83 @@ void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding
         instances);
 #endif
 
+#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
@@ -1027,7 +1104,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
         }
 #endif
 
-#if(defined(CK_ENABLE_FP8))
+#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8))
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
                      is_same_v<CDataType, half_t>)
         {
@@ -1141,6 +1218,54 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
             }
         }
 #endif
+#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+            }
+        }
+#endif
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc
new file mode 100644
index 0000000000..1396437326
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_ENABLE_FP16
+void add_device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
+void add_device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc
new file mode 100644
index 0000000000..f0de713834
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_ENABLE_FP16
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8))
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, I4, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, I4, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp
new file mode 100644
index 0000000000..e535ba0170
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+using BF8  = ck::bf8_t;
+using F8   = ck::f8_t;
+
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using namespace ck::tensor_layout::convolution;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// f16_f16_f32_f16
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_transpose_xdl_f16_instances =
+    std::tuple<
+        // clang-format off
+        // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer| LoopSched| AComputeType| BComputeType| MaxTranspose| MaxTranspose|
+        // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|          |             |             |   TransferIn|  TransferOut|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|          |             |             |    ScalarPer|    ScalarPer|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |          |             |             |       Vector|       Vector|
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), F16, F16, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 2, 2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), F16, F16, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 4, 4>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), F16, F16, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 1, 2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), F16, F16, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), F16, F16, 2, 1>
+        // clang-format on
+        >;
+
+// bf16_bf16_f32_bf16
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_transpose_xdl_bf16_instances =
+    std::tuple<
+        // clang-format off
+        // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer| LoopSched| AComputeType| BComputeType| MaxTranspose| MaxTranspose|
+        // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|          |             |             |   TransferIn|  TransferOut|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|          |             |             |    ScalarPer|    ScalarPer|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |          |             |             |       Vector|       Vector|
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 2>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), BF16, BF16, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 4, 4>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), BF16, BF16, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 1, 2>,
+        
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8, make_default_loop_scheduler(), BF16, BF16, 2, 1>
+        // clang-format on
+        >;
+
+// f32_f32_f32_f32
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_transpose_xdl_f32_instances =
+    std::tuple<
+        // clang-format off
+        // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer| LoopSched| AComputeType| BComputeType| MaxTranspose| MaxTranspose|
+        // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|          |             |             |   TransferIn|  TransferOut|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|          |             |             |    ScalarPer|    ScalarPer|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |          |             |             |       Vector|       Vector|
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4, make_default_loop_scheduler(), F32, F32, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 2, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 2, 2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4, make_default_loop_scheduler(), F32, F32, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 4, 4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 4, 4>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4, make_default_loop_scheduler(), F32, F32, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 1, 2>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 1, 2>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4, make_default_loop_scheduler(), F32, F32, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 2, 1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4, make_default_loop_scheduler(), F32, F32, 2, 1>
+        // clang-format on
+        >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
index ee86950992..5c0d7283f2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -36,6 +36,46 @@ static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
     ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0;
 
 // f16_f16_f32_f16
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_f16_generic_instances =
+    std::tuple<
+        // clang-format off
+        // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // generic instance
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>
+        // clang-format on
+        >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_f16_16_16_instances =
+    std::tuple<
+        // clang-format off
+        // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 8, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 2, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 2, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>
+        // clang-format on
+        >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -55,7 +95,7 @@ using device_grouped_conv_bwd_data_xdl_f16_instances =
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 32, 1, 4>,                1>,
 
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
@@ -73,6 +113,44 @@ using device_grouped_conv_bwd_data_xdl_f16_instances =
         >;
 
 // bf16_bf16_f32_bf16
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_bf16_generic_instances = std::tuple<
+    // clang-format off
+        // ##############################################|          NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################|       Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|              |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|              |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // generic instance
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,    S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_bf16_16_16_instances = std::tuple<
+    // clang-format off
+        // ##############################################|          NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################|       Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|              |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|              |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 8, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 2, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 2, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -91,7 +169,7 @@ using device_grouped_conv_bwd_data_xdl_bf16_instances = std::tuple<
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,    S<4, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 32, 1, 4>,                1>,
 
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,   S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
@@ -109,6 +187,46 @@ using device_grouped_conv_bwd_data_xdl_bf16_instances = std::tuple<
     >;
 
 // f32_f32_f32_f32
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_f32_generic_instances =
+    std::tuple<
+        // clang-format off
+         // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+         // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // generic instance
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 16, 1, 4>,                1>
+        // clang-format on
+        >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_f32_16_16_instances =
+    std::tuple<
+        // clang-format off
+       // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+       // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+       // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+       // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
+       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    16,    64,    32,   8,   8,   16,   16,       1,       4,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,        S<4, 4, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
+       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,        S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,            1,            1,     S<1, 16, 1, 4>,                1>,
+       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    16,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,        S<4, 4, 1>,      S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>
+        // clang-format on
+        >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -128,7 +246,7 @@ using device_grouped_conv_bwd_data_xdl_f32_instances =
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 32, 1, 4>,                1>,
 
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,            1,            1,     S<1, 32, 1, 8>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4>,
@@ -141,7 +259,7 @@ using device_grouped_conv_bwd_data_xdl_f32_instances =
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4>,
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,            1,            1,     S<1, 16, 1, 4>,                4>
         // clang-format on
         >;
 
@@ -165,7 +283,7 @@ using device_grouped_conv_bwd_data_xdl_input_fp16_comp_bf8f8_instances =
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 32, 1, 4>,                1,  LoopScheduler::Default, BF8, F8>,
 
-        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
index 3ebfd0bb7d..1c4dc8a445 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
@@ -41,13 +41,11 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
         // clang-format on
         >;
@@ -60,14 +58,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
 
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4>,
@@ -110,13 +107,11 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
         // clang-format on
         >;
@@ -129,14 +124,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
 
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 4>,
@@ -179,13 +173,11 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>
         // clang-format on
         >;
@@ -199,13 +191,11 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>,
     
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, F16, F16, 2, 2>,
@@ -241,13 +231,11 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>
         // clang-format on
         >;
@@ -260,13 +248,11 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>,
     
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 2>,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
index f9b3b43795..6c0ba2f932 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -49,6 +50,23 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
+// double rate mfma instances on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_bf16_comp_instances_2x = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -56,33 +74,63 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-#else // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // AGPR Spill when use permuted lds layout. so, use padding for these two.
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,    64,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    64,   128,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    64,    64,    64,   8,   8,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    64,    64,    64,   8,   8,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        // mfma 16x16
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               8>
+    // clang-format on
+    >;
+
+// instances not working on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_bf16_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        // AGPR Spill when use permuted lds layout. so, use padding for these two.
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>
+    // clang-format on
+    >;
+
+// double rate mfma instances on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_f16_comp_instances_2x = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 
@@ -93,16 +141,30 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-#else // defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        // mfma 16x16
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<   NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding, 1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               8>
+    // clang-format on
+    >;
+
+// instances not working on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_f16_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -115,7 +177,6 @@ using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 
@@ -134,7 +195,28 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        // mfma 16x16
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<    NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<    NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               2>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<    NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,1,256,    64,    64,    32,   8,   8,  16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              8,          1,          1,           1,                   S<1, 32, 1, 4>,               4>
+    // clang-format on
+    >;
+
+// double rate mfma instances on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_int8_comp_instances_2x = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,DsLayout,ELayout,int8_t,int8_t,int32_t,   int8_t,    DsLayout,int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,             256,   128,   128,   128,  32,  32,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,        1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         1,            1,           1,              S<1, 64, 1, 4>,              16,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 
@@ -145,16 +227,25 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,DsLayout,ELayout,int8_t,int8_t,int32_t,   int8_t,    DsLayout,int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,             256,   128,   128,   128,  32,  32,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,        1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         1,            1,           1,              S<1, 64, 1, 4>,              16,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-#else // defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>
+    // clang-format on
+    >;
+
+// instances not working on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_int8_comp_instances_part2 = std::tuple<
+    // clang-format off
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -163,10 +254,8 @@ using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
index af79eefa1f..c9ea462316 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -46,6 +46,23 @@ static constexpr auto ConvFwdOddC =
 
 static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_bf16_generic_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -80,6 +97,23 @@ using device_grouped_conv_fwd_xdl_bf16_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_f16_generic_instances = std::tuple<
+    // clang-format off
+  //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+  //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+  //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+  //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+  // generic instance
+  DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -114,6 +148,23 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_f32_generic_instances = std::tuple<
+    // clang-format off
+  //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+  //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+  //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+  //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+  // generic instance
+  DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32,      F32,    DsLayout,   F32, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -148,6 +199,23 @@ using device_grouped_conv_fwd_xdl_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_int8_generic_instances = std::tuple<
+    // clang-format off
+  //########################################|     NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+  //########################################|    Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+  //########################################|           |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+  //########################################|           |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+  // generic instance
+  DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t,   int8_t,    DsLayout, int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
index d317d270ce..0a85cde3bc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -46,6 +46,7 @@ using device_grouped_conv_fwd_xdl_large_tensor_bf16_instances = std::tuple<
         // generic instance
         DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
 
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               2>,
         DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
     // clang-format on
     >;
@@ -65,6 +66,7 @@ using device_grouped_conv_fwd_xdl_large_tensor_f16_instances = std::tuple<
         // generic instance
         DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
 
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               2>,
         DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
     // clang-format on
     >;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
index 9114d5c1fb..153cc61b09 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
@@ -40,21 +40,34 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|     Type|     Type|  Pipeline|   ToMerge|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|         |         | Scheduler|          |
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |         |         |          |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 8>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 16>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 32>
-#else // defined(CK_USE_AMD_MFMA_GFX950)
         // Instances with NumGroupsPerBatch > 1
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 16>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 32>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
+    // clang-format on
+    >;
+
+// double rate mfma instances on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|     Type|     Type|  Pipeline|   ToMerge|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|         |         | Scheduler|          |
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |         |         |          |          |
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 16>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 32>
     // clang-format on
     >;
 
@@ -65,22 +78,35 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        // Instances with NumGroupsPerBatch > 1
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 8>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 16>,
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 32>
-#else // defined(CK_USE_AMD_MFMA_GFX950)
         // Instances with NumGroupsPerBatch > 1
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 16>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 32>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
+    // clang-format on
+    >;
+
+// double rate mfma instances on gfx950
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // Instances with NumGroupsPerBatch > 1
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 16>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 32>
     // clang-format on
     >;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
index 9a70a47274..e9ff75a91d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -109,6 +109,8 @@ struct DeviceOperationInstanceFactory<
                              is_same_v<ComputeTypeB, F16>)
                 {
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP32
@@ -117,6 +119,8 @@ struct DeviceOperationInstanceFactory<
                              is_same_v<ComputeTypeB, F32>)
                 {
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -126,6 +130,78 @@ struct DeviceOperationInstanceFactory<
                 {
                     add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
+                        op_ptrs);
+                }
+#endif
+            }
+            if constexpr(is_same_v<InLayout, NGCHW> && is_same_v<WeiLayout, GKYXC> &&
+                         is_same_v<OutLayout, NGKHW>)
+            {
+#ifdef CK_ENABLE_FP16
+                if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                             is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
+                             is_same_v<ComputeTypeB, F16>)
+                {
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f16_instances(op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                             is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
+                             is_same_v<ComputeTypeB, F32>)
+                {
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f32_instances(op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, BF16> && is_same_v<WeiDataType, BF16> &&
+                             is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
+                             is_same_v<ComputeTypeB, BF16>)
+                {
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances(
+                        op_ptrs);
+                }
+#endif
+            }
+            if constexpr(is_same_v<InLayout, NGCHW> && is_same_v<WeiLayout, GKCYX> &&
+                         is_same_v<OutLayout, NGKHW>)
+            {
+#ifdef CK_ENABLE_FP16
+                if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                             is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
+                             is_same_v<ComputeTypeB, F16>)
+                {
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                             is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
+                             is_same_v<ComputeTypeB, F32>)
+                {
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, BF16> && is_same_v<WeiDataType, BF16> &&
+                             is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
+                             is_same_v<ComputeTypeB, BF16>)
+                {
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances(
+                        op_ptrs);
                 }
 #endif
             }
@@ -173,6 +249,8 @@ struct DeviceOperationInstanceFactory<
                 {
                     add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
+                        op_ptrs);
                 }
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
@@ -191,6 +269,8 @@ struct DeviceOperationInstanceFactory<
                 {
                     add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -200,6 +280,82 @@ struct DeviceOperationInstanceFactory<
                 {
                     add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
+                        op_ptrs);
+                }
+#endif
+            }
+            if constexpr(is_same_v<InLayout, NGCDHW> && is_same_v<WeiLayout, GKZYXC> &&
+                         is_same_v<OutLayout, NGKDHW>)
+            {
+#ifdef CK_ENABLE_FP16
+                if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                             is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
+                             is_same_v<ComputeTypeB, F16>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                             is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
+                             is_same_v<ComputeTypeB, F32>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f32_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, BF16> && is_same_v<WeiDataType, BF16> &&
+                             is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
+                             is_same_v<ComputeTypeB, BF16>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances(
+                        op_ptrs);
+                }
+#endif
+            }
+            if constexpr(is_same_v<InLayout, NGCDHW> && is_same_v<WeiLayout, GKCZYX> &&
+                         is_same_v<OutLayout, NGKDHW>)
+            {
+#ifdef CK_ENABLE_FP16
+                if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                             is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
+                             is_same_v<ComputeTypeB, F16>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
+                             is_same_v<OutDataType, F32> && is_same_v<ComputeTypeA, F32> &&
+                             is_same_v<ComputeTypeB, F32>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, BF16> && is_same_v<WeiDataType, BF16> &&
+                             is_same_v<OutDataType, BF16> && is_same_v<ComputeTypeA, BF16> &&
+                             is_same_v<ComputeTypeB, BF16>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances(
+                        op_ptrs);
                 }
 #endif
             }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
index 7ad0218410..c723be0db8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
@@ -69,6 +69,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
@@ -84,6 +98,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
@@ -99,6 +127,196 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 
 // conv3d backward data
@@ -161,6 +379,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
@@ -176,6 +408,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
@@ -191,6 +437,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
@@ -209,6 +469,180 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_
                                                                   BF8,
                                                                   F8>>>& instances);
 #endif
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances);
+#endif
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index f1993eb149..e8e46a7329 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -431,6 +431,51 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instances(
                         op_ptrs);
                 }
+#endif
+            }
+            if constexpr(is_same_v<InLayout, NGCHW> && is_same_v<WeiLayout, GKCYX> &&
+                         is_same_v<OutLayout, NGKHW>)
+            {
+#ifdef CK_ENABLE_FP16
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
+                             is_same_v<ComputeTypeB, half_t>)
+                {
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                             is_same_v<WeiDataType, ck::bhalf_t> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
+                             is_same_v<ComputeTypeB, float>)
+                {
+                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instances(
+                        op_ptrs);
+                }
 #endif
             }
             if constexpr(is_same_v<InLayout, NGCHW> && is_same_v<WeiLayout, GKYXC> &&
@@ -443,12 +488,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
                         op_ptrs);
-                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
-                        op_ptrs);
-                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
-                        op_ptrs);
-                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
-                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -460,12 +499,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances(
                         op_ptrs);
-                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
-                        op_ptrs);
-                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
-                        op_ptrs);
-                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
-                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP32
@@ -610,6 +643,51 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
                         op_ptrs);
                 }
+#endif
+            }
+            if constexpr(is_same_v<InLayout, NGCDHW> && is_same_v<WeiLayout, GKCZYX> &&
+                         is_same_v<OutLayout, NGKDHW>)
+            {
+#ifdef CK_ENABLE_FP16
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
+                             is_same_v<ComputeTypeB, half_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_BF16
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                             is_same_v<WeiDataType, ck::bhalf_t> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
+                             is_same_v<ComputeTypeB, float>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(
+                        op_ptrs);
+                }
 #endif
             }
             if constexpr(is_same_v<InLayout, NGCDHW> && is_same_v<WeiLayout, GKZYXC> &&
@@ -622,12 +700,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances(
                         op_ptrs);
-                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances(
-                        op_ptrs);
-                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances(
-                        op_ptrs);
-                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances(
-                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -639,12 +711,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances(
                         op_ptrs);
-                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
-                        op_ptrs);
-                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
-                        op_ptrs);
-                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances(
-                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP32
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
index 31a536c7bb..2af5edf98c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -149,10 +149,10 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            BF16,
                                                            BF16,
@@ -292,10 +292,10 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_p
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            BF16,
                                                            BF16,
@@ -304,10 +304,22 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_p
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKCYX,
                                                            NGKHW,
                                                            BF16,
                                                            BF16,
@@ -329,10 +341,10 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F16,
                                                            F16,
@@ -461,10 +473,10 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pi
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F16,
                                                            F16,
@@ -473,10 +485,22 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pi
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F16,
                                                            F16,
@@ -510,6 +534,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKCYX,
+                                                           NGKHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NHWGC,
@@ -611,10 +647,10 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            BF16,
                                                            BF16,
@@ -755,10 +791,10 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf1
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            BF16,
                                                            BF16,
@@ -767,10 +803,22 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf1
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            BF16,
                                                            BF16,
@@ -792,10 +840,10 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F16,
                                                            F16,
@@ -924,10 +972,10 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F16,
                                                            F16,
@@ -936,10 +984,22 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
+                                                           NGKDHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F16,
                                                            F16,
@@ -961,6 +1021,18 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           NGKDHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index 01415c2ddd..d5eed7592e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -226,6 +226,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
                     op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances(
+                    op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances(
                     op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances(
@@ -245,6 +248,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
                     op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances(
+                    op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances(
                     op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances(
@@ -272,20 +278,20 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 
         // layout NGCHW/GKYXC/NGKHW
         if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NGCHW> &&
-                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NGKHW>)
+                     is_same_v<WeiLayout, GKCYX> && is_same_v<OutLayout, NGKHW>)
         {
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                          is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
                          is_same_v<BComputeType, float>)
             {
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instances(
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances(
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances(
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instances(
                     op_ptrs);
             }
 #endif
@@ -294,13 +300,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                          is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t> &&
                          is_same_v<BComputeType, half_t>)
             {
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instances(
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances(
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances(
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instances(
                     op_ptrs);
             }
 #endif
@@ -311,14 +320,49 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                          is_same_v<AComputeType, ck::bhalf_t> &&
                          is_same_v<BComputeType, ck::bhalf_t>)
             {
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instances(
                     op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+        // layout NGCHW/GKYXC/NGKHW
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NGCHW> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NGKHW>)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
+                         is_same_v<BComputeType, float>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances(op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t> &&
+                         is_same_v<BComputeType, half_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> &&
+                         is_same_v<AComputeType, ck::bhalf_t> &&
+                         is_same_v<BComputeType, ck::bhalf_t>)
+            {
                 add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
-                    op_ptrs);
             }
 #endif
 #ifdef CK_ENABLE_INT8
@@ -326,14 +370,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                          is_same_v<OutDataType, int8_t> && is_same_v<AComputeType, int8_t> &&
                          is_same_v<BComputeType, int8_t>)
             {
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
-                    op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
-                    op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
-                    op_ptrs);
             }
 #endif
         }
@@ -486,7 +523,69 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
             }
 #endif
         }
+
+        // layout NGCDHW/GKCZYX/NGKDHW
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NGCDHW> &&
+                     is_same_v<WeiLayout, GKCZYX> && is_same_v<OutLayout, NGKDHW>)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
+                         is_same_v<BComputeType, float>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances(
+                    op_ptrs);
+            }
 #endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t> &&
+                         is_same_v<BComputeType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> &&
+                         is_same_v<AComputeType, ck::bhalf_t> &&
+                         is_same_v<BComputeType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances(op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+#endif // CK_USE_XDL
 
 #ifdef CK_USE_WMMA
         if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
@@ -500,7 +599,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(op_ptrs);
             }
 #endif
 #ifdef CK_ENABLE_INT8
@@ -511,7 +609,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances(op_ptrs);
             }
 #endif
         }
@@ -527,7 +624,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances(op_ptrs);
             }
 #endif
 #ifdef CK_ENABLE_INT8
@@ -538,7 +634,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances(op_ptrs);
             }
 #endif
         }
@@ -556,7 +651,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     op_ptrs);
                 add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances(op_ptrs);
             }
 #endif
 #ifdef CK_ENABLE_INT8
@@ -568,7 +662,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances(op_ptrs);
             }
 #endif
         }
@@ -586,7 +679,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     op_ptrs);
                 add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(op_ptrs);
             }
 #endif
 #ifdef CK_ENABLE_INT8
@@ -598,11 +690,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(op_ptrs);
             }
 #endif
         }
-#endif
+#endif // CK_USE_WMMA
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
index 9a83e36b99..b830bdce71 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
@@ -23,6 +23,34 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_FP16
@@ -39,6 +67,34 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_FP32
@@ -73,12 +129,40 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
-// grouped conv2d forward, NGCHW/GKYXC/NGKHW
+// grouped conv2d forward, NGCHW/GKCYX/NGKHW
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -91,10 +175,38 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
 #endif
 
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -107,10 +219,10 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
 #endif
 
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -122,22 +234,6 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
-#ifdef CK_ENABLE_INT8
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-#endif
-
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
@@ -187,6 +283,111 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
index 662fadadcf..00351ceefd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
@@ -73,12 +73,12 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance
                                                                 PassThrough>>>& instances);
 #endif
 
-// grouped conv2d forward, NGCHW/GKYXC/NGKHW
+// grouped conv2d forward, NGCHW/GKCYX/NGKHW
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -91,10 +91,10 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances
 #endif
 
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -107,10 +107,10 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance
 #endif
 
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -122,22 +122,6 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances
                                                                 PassThrough>>>& instances);
 #endif
 
-#ifdef CK_ENABLE_INT8
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-#endif
-
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances(
@@ -187,6 +171,55 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instan
                                                                 PassThrough>>>& instances);
 #endif
 
+// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
index f283fe8550..bd44116057 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
@@ -73,12 +73,12 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance
                                                                 PassThrough>>>& instances);
 #endif
 
-// grouped conv2d forward, NGCHW/GKYXC/NGKHW
+// grouped conv2d forward, NGCHW/GKCYX/NGKHW
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -91,10 +91,10 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances
 #endif
 
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -107,10 +107,10 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance
 #endif
 
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -122,22 +122,6 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances
                                                                 PassThrough>>>& instances);
 #endif
 
-#ifdef CK_ENABLE_INT8
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-#endif
-
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances(
@@ -187,6 +171,55 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instan
                                                                 PassThrough>>>& instances);
 #endif
 
+// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_wmma.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_wmma.inc
index 0ea24d0929..df4e95007d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_wmma.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_wmma.inc
@@ -51,20 +51,6 @@ void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 GNHWC,
@@ -107,20 +93,6 @@ void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                GNHWC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                GNHWK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 GNDHWC,
@@ -163,20 +135,6 @@ void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                GNDHWC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                GNDHWK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -219,19 +177,6 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_FP16
@@ -291,20 +236,6 @@ void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 GNDHWC,
@@ -347,20 +278,6 @@ void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instanc
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                GNDHWC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                GNDHWK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
@@ -403,20 +320,6 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instanc
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 GNHWC,
@@ -458,20 +361,6 @@ void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances(
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
-
-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                GNHWC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                GNHWK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances);
 #endif
 
 } // namespace instance
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
index c977c89c94..d3624b0fd9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -252,6 +252,55 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+// grouped conv2d forward, NGCHW/GKCYX/NGKHW
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
@@ -468,6 +517,55 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instances(
                                                                 F8>>>& instances);
 #endif
 
+// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
index a81e1e07ba..9f54c4b633 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
@@ -24,10 +24,10 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_inst
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -54,10 +54,10 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_insta
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -84,10 +84,10 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_insta
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -114,10 +114,10 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_inst
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
 
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_int8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 int8_t,
@@ -178,6 +178,55 @@ void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_in
                                                                 PassThrough>>>& instances);
 #endif
 
+// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 5b88d5f25c..fe35d9ca76 100755
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -60,6 +60,13 @@ function(add_instance_library INSTANCE_NAME)
             list(REMOVE_ITEM ARGN "${source}")
         endif()
     endforeach()
+    # Do not build MX instances if gfx950 targets are not on the target list
+    foreach(source IN LISTS ARGN)
+        if(NOT INST_TARGETS MATCHES "gfx950" AND source MATCHES "_mx")
+            message("removing MX instance ${source} ")
+            list(REMOVE_ITEM ARGN "${source}")
+        endif()
+    endforeach()
     # Do not build WMMA instances if gfx11 targets are not on the target list
     foreach(source IN LISTS ARGN)
 	if(NOT INST_TARGETS MATCHES "gfx11" AND NOT INST_TARGETS MATCHES "gfx12" AND source MATCHES "_wmma")
@@ -74,56 +81,61 @@ function(add_instance_library INSTANCE_NAME)
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
-    # Do not build gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94
+    # Do not build XDL gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94
     if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        foreach(source IN LISTS ARGN)
+            if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_multiply_multiply" AND source MATCHES "_f8_")
+                message("removing gemm_multiply_multiply_f8 instance ${source} ")
+                list(REMOVE_ITEM ARGN "${source}")
+            endif()
+        endforeach()
+        foreach(source IN LISTS ARGN)
+            if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_")
+                message("removing gemm_universal_f8 instance ${source} ")
+                list(REMOVE_ITEM ARGN "${source}")
+            endif()
+        endforeach()
+    endif()
+    # Do not build WMMA gemm_universal_f8 for any targets except gfx12+
     foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_multiply_multiply_xdl_f8")
-         message("removing gemm_multiply_multiply_f8 instance ${source} ")
-         list(REMOVE_ITEM ARGN "${source}")
-    endif()
+        if(NOT INST_TARGETS MATCHES "gfx12" AND source MATCHES "gemm_wmma_universal" AND source MATCHES "_f8_")
+            message("removing gemm_universal_f8 instance ${source} ")
+            list(REMOVE_ITEM ARGN "${source}")
+        endif()
     endforeach()
-    foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_")
-         message("removing gemm_universal_f8 instance ${source} ")
-         list(REMOVE_ITEM ARGN "${source}")
-    endif()
-    endforeach()
-    foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "batched_gemm_xdl_universal" AND source MATCHES "_f8_")
-         message("removing batched_gemm_universal_f8 instance ${source} ")
-         list(REMOVE_ITEM ARGN "${source}")
-    endif()
-    endforeach()
-    foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_")
-         message("removing gemm_universal_streamk_f8 instance ${source} ")
-         list(REMOVE_ITEM ARGN "${source}")
-    endif()
-    endforeach()
-    endif()
+    message("remaining instances: ${ARGN}")
     #only continue if there are some source files left on the list
     if(ARGN)
         set(INST_OBJ)
         foreach(source IN LISTS ARGN)
             set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
             if(source MATCHES "_xdl")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
             elseif(source MATCHES "_wmma")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
             elseif(source MATCHES "mha")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+            elseif(source MATCHES "_mx")
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
             endif()
-            #only build the fp8 gemm instances for gfx908/90a if the build argument is set
+            #only build the fp8 gemm instances for gfx90a if the build argument is set, otherwise only build for gfx942/gfx950
             if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
                 if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                 endif()
-                if(source MATCHES "gemm_multiply_multiply_f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                if(source MATCHES "gemm_multiply_multiply" AND source MATCHES "f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                 endif()
-                if(source MATCHES "bached_gemm_multiply_multiply_f8")
-                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+            else()
+                if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                 endif()
+                if(source MATCHES "gemm_multiply_multiply" AND source MATCHES "f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                endif()
+            endif()
+            if(source MATCHES "gemm_wmma_universal" AND source MATCHES "f8")
+                list(FILTER INST_TARGETS INCLUDE REGEX "gfx12")
             endif()
             set(offload_targets)
             foreach(target IN LISTS INST_TARGETS)
@@ -243,7 +255,11 @@ FOREACH(subdir_path ${dir_list})
             message("Found only xdl instances, but gfx9 is not on the targets list. Skipping.")
             set(add_inst 0)
         endif()
-	if(("${cmake_instance}" MATCHES "ONLY WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12"))
+        if(("${cmake_instance}" MATCHES "ONLY MX_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx950"))
+            message("Found only MX instances, but gfx950 is not on the targets list. Skipping.")
+            set(add_inst 0)
+        endif()
+	    if(("${cmake_instance}" MATCHES "ONLY WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12"))
             message("Found only wmma instances, but gfx11 is not on the targets list. Skipping.")
             set(add_inst 0)
         endif()
@@ -251,14 +267,18 @@ FOREACH(subdir_path ${dir_list})
             message("Found only xdl and dl instances, but gfx9 is not on the targets listand DL_KERNELS is not set. Skipping.")
             set(add_inst 0)
         endif()
-	if(("${cmake_instance}" MATCHES "ONLY XDL_AND_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12") AND (NOT INST_TARGETS MATCHES "gfx9"))
+	    if(("${cmake_instance}" MATCHES "ONLY XDL_AND_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12") AND (NOT INST_TARGETS MATCHES "gfx9"))
             message("Found only xdl and wmma instances, but gfx11 and gfx9 are not on the targets list. Skipping.")
             set(add_inst 0)
         endif()
-	if(("${cmake_instance}" MATCHES "XDL_DL_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12") AND (NOT INST_TARGETS MATCHES "gfx9") AND (NOT DEFINED DL_KERNELS))
+	    if(("${cmake_instance}" MATCHES "XDL_DL_WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12") AND (NOT INST_TARGETS MATCHES "gfx9") AND (NOT DEFINED DL_KERNELS))
             message("Found xdl, dl, and wmma instances, but none of those meet the target list. Skipping.")
             set(add_inst 0)
         endif()
+        if(("${cmake_instance}" MATCHES "gemm_multiply_multiply" AND "${cmake_instance}" MATCHES "_f8_" ) AND (NOT INST_TARGETS MATCHES "gfx94") AND (NOT INST_TARGETS MATCHES "gfx95") AND (NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH))
+            message("Found gemm_multiply_multiply_f8 instances, but gfx94/gfx95 not on the target list. Skipping.")
+            set(add_inst 0)
+        endif()
         if((add_inst EQUAL 1))
             get_filename_component(target_dir ${subdir_path} NAME)
             add_subdirectory(${target_dir})
@@ -446,4 +466,3 @@ set(DEV_OPS_INC_DIRS
     ${PROJECT_SOURCE_DIR}/library/include/ck/
 )
 rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
-
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
index dad67b396f..f3a32bab54 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -27,15 +28,12 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##########|        AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //##########|         Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //##########|             |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //##########|             |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -70,6 +68,17 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances_2x = std::tuple<
+    // clang-format off
+        //##########|        AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
+        //##########|         Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
+        //##########|             |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
+        //##########|             |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
     std::vector<std::unique_ptr<
@@ -78,6 +87,12 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(
 {
     add_device_operation_instances(instances,
                                    device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
index 62df9a81b5..43810d9a5e 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -27,15 +28,12 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##########|        AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //##########|         Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //##########|             |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //##########|             |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -70,6 +68,17 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances_2x = std::tuple<
+    // clang-format off
+        //##########|        AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
+        //##########|         Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
+        //##########|             |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
+        //##########|             |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(
     std::vector<std::unique_ptr<
@@ -78,6 +87,12 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(
 {
     add_device_operation_instances(instances,
                                    device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
index 23e176f6e8..b3bf65325b 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -26,29 +27,33 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_generic_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,    64,    16,    16,     4, 16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,    64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_generic_instances_2x = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,    64,    16,    16,     4, 16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -110,6 +115,17 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances_2x = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(
     std::vector<std::unique_ptr<
@@ -120,6 +136,14 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(
         instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_generic_instances{});
     add_device_operation_instances(instances,
                                    device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_generic_instances_2x{});
+        add_device_operation_instances(
+            instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
index 0216b002ac..39580485ec 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -26,29 +27,33 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_generic_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,     4, 16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_generic_instances_2x = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,     4, 16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -98,6 +103,17 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances_2x = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
     std::vector<std::unique_ptr<
@@ -108,6 +124,14 @@ void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
         instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_generic_instances{});
     add_device_operation_instances(instances,
                                    device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_generic_instances_2x{});
+        add_device_operation_instances(
+            instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 6af3c2654e..cf23d01bf2 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -33,13 +33,11 @@ static constexpr auto GemmPadded  = ck::tensor_operation::device::GemmSpecializa
 template <bool Masking>
 using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#######################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|    MaskOut|
         //#######################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|      Upper|
         //#######################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|   Triangle|
         //#######################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |           |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
@@ -61,13 +59,11 @@ using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_
 template <bool Masking>
 using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_irregular_k_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#######################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|    MaskOut|
         //#######################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|      Upper|
         //#######################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|   Triangle|
         //#######################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |           |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded,        1,   256,    256,   128,    40,    64,    32,   4,   4,    2,   32,   32,     2,     4,     2,     S<2,128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,     false,      S<2,128, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded,        1,   256,    256,   128,    40,   128,    32,   4,   4,    2,   32,   32,     2,     4,     4,     S<2,128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,     false,      S<2,128, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded,        1,   256,    128,   256,    40,    64,    32,   4,   4,    2,   32,   32,     1,     8,     2,     S<2,128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,     false,      S<2,128, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index 8382f069d7..498bf58fb3 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -40,13 +40,11 @@ template <index_t NumDimG,
           MaskingSpecialization MaskingSpec>
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData|     Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
         // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|             Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
         // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
         // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,        1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index b6c14d69db..744bd6456d 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -40,13 +40,11 @@ template <index_t NumDimG,
           MaskingSpecialization MaskingSpec>
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData|    Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
         // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|            Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
         // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
         // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,       1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index 2ce5124706..b342612d1c 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -40,13 +40,11 @@ template <index_t NumDimG,
           MaskingSpecialization MaskingSpec>
 using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData| Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
         // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|         Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
         // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
         // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 4e8adceb1c..6d64a2e2d6 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -58,15 +59,26 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    32,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    32,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    32,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         // Padded fallback kernel
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
         // clang-format on
         >;
 
+// instances not working on gfx950
+template <index_t NumDimG,
+          index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          index_t NumDimO,
+          MaskingSpecialization MaskingSpec>
+using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances_part2 =
+    std::tuple<
+        // clang-format off
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        // clang-format on
+        >;
+
 void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
     std::vector<std::unique_ptr<
         DeviceBatchedGemmSoftmaxGemmPermute<2,
@@ -97,6 +109,19 @@ void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f
             1,
             1,
             MaskingSpecialization::MaskOutUpperTriangle>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances_part2<
+                2,
+                1,
+                1,
+                1,
+                1,
+                MaskingSpecialization::MaskOutUpperTriangle>{});
+    }
 }
 
 void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances(
@@ -129,6 +154,19 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_g
             1,
             1,
             MaskingSpecialization::MaskDisabled>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances_part2<
+                2,
+                1,
+                1,
+                1,
+                1,
+                MaskingSpecialization::MaskDisabled>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
index c0c74ff7fb..86c17aacf0 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -40,14 +41,11 @@ static constexpr auto ConvFwdOddC =
 
 // arbitrary conv
 using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -63,17 +61,24 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances = std::tuple<
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// 1x1, pad 0
-using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+// 1x1, pad 0
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
+    // clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -89,17 +94,24 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std:
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// 1x1, stride 1, pad 0
-using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+// 1x1, stride 1, pad 0
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
+    // clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -115,16 +127,23 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = s
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
+    // clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -145,6 +164,16 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances = std::
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   128,    64,    64,     2,  4,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
 
 void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
     std::vector<std::unique_ptr<
@@ -159,6 +188,18 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
         instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{});
     add_device_operation_instances(
         instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_2x{});
+        add_device_operation_instances(
+            instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances_2x{});
+        add_device_operation_instances(
+            instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances_2x{});
+        add_device_operation_instances(
+            instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
index 715dffb7ff..3402653e84 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -39,14 +40,11 @@ static constexpr auto ConvFwdOddC =
 
 // arbitrary conv
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -62,17 +60,24 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = s
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// 1x1, pad 0
-using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+// 1x1, pad 0
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
+    // clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -88,17 +93,24 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instan
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// 1x1, stride 1, pad 0
-using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+// 1x1, stride 1, pad 0
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
+    // clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -114,17 +126,25 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_ins
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// Odd C
-using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+        // clang-format on
+        >;
+
+// Odd C
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
+    // clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -145,6 +165,16 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instanc
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   128,    64,    64,     2,  4,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
 
 void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(
     std::vector<DeviceConvFwdBiasActivationPtr<PassThrough, PassThrough, AddRelu>>& instances)
@@ -158,6 +188,21 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(
         device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{});
     add_device_operation_instances(
         instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
index 5676d77986..faac2813ba 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -37,14 +38,11 @@ static constexpr auto ConvFwdOddC =
 
 // arbitrary conv
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -60,17 +58,24 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// 1x1, pad 0
-using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+// 1x1, pad 0
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
+    // clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -86,17 +91,24 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_in
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// 1x1, stride 1, pad 0
-using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
+
+// 1x1, stride 1, pad 0
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
+    // clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -112,17 +124,25 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
-
-// Odd C
-using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+        // clang-format on
+        >;
+
+// Odd C
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
+    // clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -143,6 +163,16 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_ins
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   128,    64,    64,     2,  4,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 4>,               8>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances_2x = std::tuple<
+    // clang-format off
+        //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>
+    // clang-format on
+    >;
 
 void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances(
     std::vector<DeviceConvFwdBiasActivationAddPtr<PassThrough, PassThrough, AddReluAdd>>& instances)
@@ -158,6 +188,22 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instan
     add_device_operation_instances(
         instances,
         device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
index 0d143b95ee..7fcab3ea46 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -29,15 +30,12 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                     Version|
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -87,6 +85,17 @@ using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances = std::tu
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances_2x = std::tuple<
+    // clang-format off
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                     Version|
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
@@ -95,6 +104,12 @@ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
index ce58469697..5bf0e8b7eb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
@@ -11,6 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -55,11 +56,6 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple<
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV2<   Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        2,   256,   256,   256,    32,   8,   8,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        ,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,    64,   128,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4,  LoopScheduler::Default,        PipelineVersion::v1>
-#endif
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
         ,
@@ -96,6 +92,18 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
+using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances_2x = std::tuple<
+    // clang-format off
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,    64,   128,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4,  LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
@@ -110,6 +118,20 @@ void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(
 
     add_device_operation_instances(
         instances, device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances<MNKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances_2x<GemmDefault>{});
+
+        add_device_operation_instances(
+            instances, device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances_2x<MNPadding>{});
+
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances_2x<MNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 3ebd0c5351..27240122de 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -31,15 +32,12 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
@@ -98,6 +96,17 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
     std::vector<std::unique_ptr<
@@ -106,6 +115,12 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
 {
     add_device_operation_instances(instances,
                                    device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index bd0f7346f7..dd1abb6079 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -31,15 +32,12 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
@@ -98,6 +96,17 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances_2x = std::tuple<
+    // clang-format off
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<
@@ -106,6 +115,12 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
 {
     add_device_operation_instances(instances,
                                    device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index a6ac333ea3..538e99b7e1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -11,6 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -35,27 +36,31 @@ static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecializati
 static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instances_2x = std::tuple<
+    // clang-format off
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
@@ -133,6 +138,12 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
 
     add_device_operation_instances(
         instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances<MNKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index 852b053527..278e56f556 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -11,6 +11,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -34,30 +35,35 @@ static constexpr auto MNKPadding  = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instances = std::tuple<
-// clang-format off
+    // clang-format off
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+// double rate mfma instances on gfx950
+using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instances_2x = std::tuple<
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
         //DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         //DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,    64,   128,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4,  LoopScheduler::Default,        PipelineVersion::v1>
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
@@ -125,6 +131,12 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
 
     add_device_operation_instances(
         instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances<MNKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 41f6ec2bf7..caf17d55cb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -8,6 +8,8 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
 #ifdef CK_ENABLE_INT8
 namespace ck {
 namespace tensor_operation {
@@ -47,12 +49,19 @@ using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances =
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,             16>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        ,
+        // clang-format on
+        >;
+// double rate mfma instances on gfx950
+using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances_2x =
+    std::tuple<
+        // clang-format off
+        //#####################| ALayout| BLayout| CLayout|  AData|  BData|  CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################|        |        |        |   Type|   Type|   Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################|        |        |        |       |       |       |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################|        |        |        |       |       |       |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,   128,  32,  32,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,    64,   256,  64,  64,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 8>,               4,  LoopScheduler::Default,        PipelineVersion::v1>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
-       // clang-format on
+        // clang-format on
         >;
 
 void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
@@ -62,6 +71,12 @@ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
 {
     add_device_operation_instances(instances,
                                    device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(instances,
+                                       device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
index 74cf837500..a64424e8ac 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
@@ -9,7 +9,8 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-using Instances = std::tuple<
+using Instances =
+    std::tuple<
 // clang-format off
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -17,8 +18,6 @@ using Instances = std::tuple<
         //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |         |
         //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |         |
         //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |         |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
@@ -28,8 +27,8 @@ using Instances = std::tuple<
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
-    // clang-format on
-    >;
+        // clang-format on
+        >;
 
 void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_interwave_pipeline_v1_instances(
     OwnerList<InstanceNT>& instances)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
index f2b28f3b40..0a0406baec 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,8 +17,6 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                         |                           |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                         |                           |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                         |                           |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   64,     16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1, LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
     // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
index da5fefe5da..3671bea7a3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
@@ -9,7 +9,8 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-using Instances = std::tuple<
+using Instances =
+    std::tuple<
 // clang-format off
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -17,8 +18,6 @@ using Instances = std::tuple<
         //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
@@ -28,8 +27,8 @@ using Instances = std::tuple<
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
-    // clang-format on
-    >;
+        // clang-format on
+        >;
 
 void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_interwave_pipeline_v1_instances(
     OwnerList<InstanceNN>& instances)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
index b6c03b3367..95fc8ecb46 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,8 +17,6 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmMNPadding,   64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
     // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
index bf271cc3c3..fa53a3bf0f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
@@ -9,7 +9,8 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
-using Instances = std::tuple<
+using Instances =
+    std::tuple<
 // clang-format off
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -17,8 +18,6 @@ using Instances = std::tuple<
         //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
@@ -37,8 +36,8 @@ using Instances = std::tuple<
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,    16,    32,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,    64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
-    // clang-format on
-    >;
+        // clang-format on
+        >;
 
 void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_interwave_pipeline_v1_instances(
     OwnerList<InstanceTT>& instances)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
index 0df59933c2..c9d1913aec 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,8 +17,6 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
     // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
index d9260d85ab..0410eabb70 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
@@ -17,8 +17,6 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
index 8b98133ada..a41919aab7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,8 +17,6 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   144,     8,  8,   16,   16,    2,    9,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8,  8, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   144,     4,  8,   16,   16,    2,    9,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt
index aab1c4e86e..d572862884 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt
@@ -4,16 +4,13 @@ set(GEMM_AB_SCALE_INSTANCES)
 list(APPEND GEMM_AB_SCALE_INSTANCES 
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp
-        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp
-        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp
-        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instance.cpp
         )
 
 set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 add_instance_library(device_gemm_ab_scale_instance ${GEMM_AB_SCALE_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp
index 3a7df8d974..eba9cfcb7c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp
@@ -34,49 +34,50 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances = std::tuple<
+using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances = std::tuple<
     // clang-format off
-        //################################| ALayout| BLayout|               DsLayout| ELayout|AData| BData|              DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block| Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                       |        | Type|  Type|                Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                       |        |     |      |                    |      |        |         |   Operation|   Operation|      Operation|              |      |     M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                       |        |     |      |                    |      |        |         |            |            |               |              |      |      |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        //################################| ALayout| BLayout|       DsLayout| ELayout|      AData|      BData|     DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block| Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |               |        |       Type|       Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //################################|        |        |               |        |           |           |           |      |        |         |   Operation|   Operation|      Operation|              |      |     M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //################################|        |        |               |        |           |           |           |      |        |         |            |            |               |              |      |      |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         
         // Compute friendly
-        // Spill in current compiler
-        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,   128,   128,   128,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,   128,   128,   128,   128,    64,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,   128,   128,   128,    64,   128,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,   128,   128,   128,    64,    64,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,    64,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,    64,   128,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,    64,    64,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
     // clang-format on
     >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
-using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_instances = std::tuple<
+using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances = std::tuple<
     // clang-format off
-        //################################| ALayout| BLayout|      DsLayout| ELayout|AData| BData|  DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |              |        | Type|  Type|    Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size|  Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
-        //################################|        |        |              |        |     |      |        |      |        |         |   Operation|   Operation|       Operation|              |      |      M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
-        //################################|        |        |              |        |     |      |        |      |        |         |            |            |                |              |      |       |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+        //################################| ALayout| BLayout|      DsLayout| ELayout|AData    |     BData|      DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |              |        | Type    |      Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size|  Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //################################|        |        |              |        |         |          |            |      |        |         |   Operation|   Operation|       Operation|              |      |      M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //################################|        |        |              |        |         |          |            |      |        |         |            |            |                |              |      |       |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
 
-        // Latency friendly 
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,    64,    128,   128,   128,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
-        // Memory friendly
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,   128,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,   128,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    64,   32,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    64,   16,    128, 16,  16,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,    64,    128,   128,   128,    16,   16,     64, 16,  16,  16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,    64,    128,   128,   128,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    16,   64,    128, 16,  16,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    32,   64,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    16,  128,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
-        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,  PassThrough, PassThrough, PassThrough,     GemmSpec,   128,    128,   128,   128,    32,  128,    128, 16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<8, 8, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+        // Memory friendly 
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   256,   128,  8,  16,  16,   16,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   128,   128,  8,  16,  16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,    64,   128,  8,  16,  16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                 S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   128,   256, 16,  16,  16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,    64,   256, 16,  16,  16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   256,   128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   128, 16,  16,  16,   16,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   256, 16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   256, 16,  16,  16,   16,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   256,   128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   128,   128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,    64,   128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   128,   256, 16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Col,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,    64,   256, 16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp
index ab83c7eb3e..aebffc01f2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp
@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -19,7 +19,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_i
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
@@ -28,7 +28,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_i
 {
     add_device_operation_instances(
         instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances<GemmDefault>{});
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances<GemmDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp
index dfb1bb6e2d..31fffae080 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp
@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_kpadding_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -19,7 +19,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
@@ -28,7 +28,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_
 {
     add_device_operation_instances(
         instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances<GemmKPadding>{});
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_instances<GemmKPadding>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp
deleted file mode 100644
index d2d3ebe81e..0000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
-                                                            Col,
-                                                            Tuple<>,
-                                                            Row,
-                                                            F8,
-                                                            F32,
-                                                            F8,
-                                                            F32,
-                                                            Tuple<>,
-                                                            BF16,
-                                                            128,
-                                                            128,
-                                                            128,
-                                                            PassThrough,
-                                                            PassThrough,
-                                                            PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances<GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp
deleted file mode 100644
index f6ce77a751..0000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instance.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_mnpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
-                                                            Col,
-                                                            Tuple<>,
-                                                            Row,
-                                                            F8,
-                                                            F32,
-                                                            F8,
-                                                            F32,
-                                                            Tuple<>,
-                                                            BF16,
-                                                            128,
-                                                            128,
-                                                            128,
-                                                            PassThrough,
-                                                            PassThrough,
-                                                            PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_instances<GemmMNPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp
index e2205ad728..569911e3de 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp
@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -19,7 +19,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
@@ -28,8 +28,8 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default
 {
     add_device_operation_instances(
         instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_instances<Intrawave,
-                                                                               GemmDefault>{});
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances<Intrawave,
+                                                                             GemmDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp
index 5c0a6eb00d..d1e5b6b535 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp
@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instances(
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
                                                             Tuple<>,
@@ -19,7 +19,7 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpaddin
                                                             F32,
                                                             Tuple<>,
                                                             BF16,
-                                                            128,
+                                                            1,
                                                             128,
                                                             128,
                                                             PassThrough,
@@ -28,8 +28,8 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpaddin
 {
     add_device_operation_instances(
         instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_instances<Intrawave,
-                                                                               GemmKPadding>{});
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_instances<Intrawave,
+                                                                             GemmKPadding>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instance.cpp
deleted file mode 100644
index cc1a03b060..0000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instance.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_mnkpadding_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
-                                                            Col,
-                                                            Tuple<>,
-                                                            Row,
-                                                            F8,
-                                                            F32,
-                                                            F8,
-                                                            F32,
-                                                            Tuple<>,
-                                                            BF16,
-                                                            128,
-                                                            128,
-                                                            128,
-                                                            PassThrough,
-                                                            PassThrough,
-                                                            PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_instances<Intrawave,
-                                                                               GemmMNKPadding>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 9ea79b1467..33ee841354 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -38,29 +39,36 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_generic_instance_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -119,19 +127,28 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn
 #endif
         // clang-format on
         >;
-
-// irregular tile size
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+// irregular tile size
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -145,6 +162,18 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -168,6 +197,19 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index f6a959a447..260f9288b2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -38,29 +39,36 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[n, k], d0[m, n], d1[m, n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_generic_instance_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -119,19 +127,28 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn
 #endif
         // clang-format on
         >;
-
-// irregular tile size
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+// irregular tile size
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -145,6 +162,18 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -168,6 +197,19 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index 8a68a8bb76..8113c7013a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -38,29 +39,36 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[k, n], d0[m, n], d1[m, n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_generic_instance_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -119,19 +127,28 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn
 #endif
         // clang-format on
         >;
-
-// irregular tile size
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+// irregular tile size
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -145,6 +162,18 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
@@ -168,6 +197,19 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index 42f7a10b9f..b94e902335 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -38,29 +39,36 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[n, k], d0[m, n], d1[m ,n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_generic_instance_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -110,19 +118,28 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn
 #endif
         // clang-format on
         >;
-
-// irregular tile size
-using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances =
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances_2x =
     std::tuple<
-// clang-format off
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
+// irregular tile size
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances =
+    std::tuple<
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -136,6 +153,18 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
@@ -159,6 +188,19 @@ void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn
     add_device_operation_instances(
         instances,
         device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index 6d1c16776a..b34f5e162c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -23,28 +24,35 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_generic_instance_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -103,19 +111,27 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instanc
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -129,6 +145,18 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregul
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -151,6 +179,19 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_inst
     add_device_operation_instances(
         instances,
         device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index 8b746781ef..befa868e28 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -23,28 +24,35 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[n, k], d0[m, n], d1[m, n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_generic_instance_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -103,19 +111,27 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instanc
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -129,6 +145,18 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregul
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -151,6 +179,19 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_inst
     add_device_operation_instances(
         instances,
         device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index 414ab674d3..47b3d16fcd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -23,28 +24,35 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[k, n], d0[m, n], d1[m, n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_generic_instance_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -103,19 +111,27 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instanc
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -129,6 +145,18 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregul
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
@@ -151,6 +179,19 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_inst
     add_device_operation_instances(
         instances,
         device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index cf6a64eb26..7c9ab826ae 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -23,28 +24,35 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[n, k], d0[m, n], d1[m ,n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_generic_instance =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_generic_instance_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
+
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -94,19 +102,27 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instanc
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -120,6 +136,18 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregul
 #endif
         // clang-format on
         >;
+// double rate mfma instances on gfx950
+using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
@@ -142,6 +170,19 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_inst
     add_device_operation_instances(
         instances,
         device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
index 184d7974d8..28a452c1a1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
@@ -37,13 +37,11 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|            CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise|    Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|      Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |               |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -66,13 +64,11 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|            CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|           PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|              Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise|    Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|  ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|      Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                  _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |               |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                      |                |                     |                |                       |                      |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,  PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
index ef37c82c7f..13366238d6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
@@ -37,13 +37,11 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -66,13 +64,11 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|               Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                       |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                       |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                       |                       |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,              S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,   PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
index 40fbc85be0..8a4889ee83 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -37,13 +37,11 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -66,13 +64,11 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|              Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                       |                      |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,  PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
index 464279c376..fc3cbcf905 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -37,13 +37,11 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<16, 8>,               8,             S<16, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -63,13 +61,11 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|              Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                       |                      |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,  PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index e2bf62ca94..c5e82abed6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,28 +23,34 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[k, m], b[k, n]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_generic_instance = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
-using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_generic_instance_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -102,18 +109,26 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::t
 #endif
     // clang-format on
     >;
-
-// irregular tile size
-using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+
+// irregular tile size
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances = std::tuple<
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -127,6 +142,18 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_ins
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -148,6 +175,18 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
     add_device_operation_instances(
         instances,
         device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index bbf2836cf8..1e65127726 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,28 +23,34 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[k, m], b[k, n]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_generic_instance = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
-using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_generic_instance_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -102,18 +109,26 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::t
 #endif
     // clang-format on
     >;
-
-// irregular tile size
-using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances = std::tuple<
-// clang-format off
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+
+// irregular tile size
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances = std::tuple<
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -127,6 +142,18 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_ins
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
@@ -148,6 +175,18 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
     add_device_operation_instances(
         instances,
         device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index f22739e1a6..a46af7cb16 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,28 +23,34 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[m, k], b[k, n]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instance = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instance_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -102,18 +109,26 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::t
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // irregular tile size
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -127,6 +142,18 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_ins
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
@@ -148,6 +175,18 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
     add_device_operation_instances(
         instances,
         device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index b049a5bac1..bb0ee323d8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/utility/sequence.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,28 +23,34 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[m, k], b[n, k]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instance = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instance_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
+
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -93,18 +100,26 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::t
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances_2x = std::tuple<
+    // clang-format off
+        //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
+    // clang-format on
+    >;
 
 // irregular tile size
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -118,6 +133,18 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_ins
 #endif
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances_2x =
+    std::tuple<
+        // clang-format off
+        //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+        // pipeline v1, 1 wave
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+        // clang-format on
+        >;
 
 void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
@@ -139,6 +166,18 @@ void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
     add_device_operation_instances(
         instances,
         device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instance_2x{});
+        add_device_operation_instances(
+            instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances_2x{});
+        add_device_operation_instances(
+            instances,
+            device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances_2x{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
index 57bae7a2ac..6336833c3a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/CMakeLists.txt
@@ -10,6 +10,8 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_INSTANCES
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part1.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part2.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part2.cpp
+        device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp
+        device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -23,6 +25,8 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_INSTANCES
         device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part1.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part2.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part2.cpp
+        device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp
+        device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
         device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -44,6 +48,8 @@ set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instance_part1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_kpadding_instance_part1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
@@ -53,6 +59,8 @@ set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_g
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 set_source_files_properties(device_gemm_multiply_multiply_xdl_i8_i8_f16/device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_xdl_i8_i8_f16/device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
index 17da9ce5b7..b498aba422 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
@@ -36,12 +36,11 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part1 = std::tuple<
-// clang-format off
+    // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -60,18 +59,16 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part1
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    96,   256,  16,  16,  32,   32,    1,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    64,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
     // clang-format on
     >;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part2 = std::tuple<
-// clang-format off
+    // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
 
         // Compute friendly
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    128,  128,  16,  16,  32,   32,    3,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -100,19 +97,17 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part2
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    16,   192,   256,  16,  16,  16,   16,    1,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    16,   128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
     // clang-format on
     >;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_instances_part1 =
     std::tuple<
-// clang-format off
+        // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         // 256x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -125,7 +120,20 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_insta
         // 224x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   256,   128,  16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   192,   128,  16,  16,  16,   16,    7,    6,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   192,   128,  16,  16,  16,   16,    7,    6,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_instances_part2 =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        // Compute friendly
+        // 224x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   128,   128,  16,  16,  16,   16,    7,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,    96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -139,19 +147,17 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_insta
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    64,   256,  16,  16,  16,   16,    6,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
         // clang-format on
         >;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_instances_part2 =
+using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_instances_part3 =
     std::tuple<
-// clang-format off
+        // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
 
         // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256
@@ -164,6 +170,11 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_insta
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   160,    64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   160,    96,   256,  16,  16,  16,   16,    5,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   160,    64,   256,  16,  16,  16,   16,    5,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 144x[64, 256, 32]x128
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   144,    64,   128,   8,  16,  16,   16,    9,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1,16>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   144,   128,   128,   8,  16,  16,   16,    9,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1,16>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   144,   192,   128,   8,  16,  16,   16,    9,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1,16>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   144,   256,   128,   8,  16,  16,   16,    9,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1,16>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         // 128x[64, 256, 32]x128, 128x[64, 128, 32]x256
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -175,18 +186,16 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_insta
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    64,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
         // clang-format on
         >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|       Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |                |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
@@ -222,8 +231,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_instances = std:
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    32,  128,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,      S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, BF16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    32,  128,    256, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,      S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
 
-// clang-format on
-#endif
+    // clang-format on
     >;
 } // namespace instance
 } // namespace device
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp
new file mode 100644
index 0000000000..0e5b0ff5d7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_default_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_instances_part3<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp
new file mode 100644
index 0000000000..960a597659
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_mfma16x16_instances_part3<
+            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn.hpp
index 5c854ee5d9..eb473e0115 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn.hpp
@@ -36,12 +36,11 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_instances_part1 = std::tuple<
-// clang-format off
+    // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -59,18 +58,16 @@ using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_instances_part1
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    96,   256,  16,  16,  32,   32,    1,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    64,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
     // clang-format on
     >;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_instances_part2 = std::tuple<
-// clang-format off
+    // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
 
         // Compute friendly
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,   256,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -92,19 +89,17 @@ using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_instances_part2
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    16,   192,   256,  16,  16,  16,   16,    1,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    16,   128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
     // clang-format on
     >;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instances_part1 =
     std::tuple<
-// clang-format off
+        // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         // 256x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -117,7 +112,20 @@ using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instan
         // 224x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   256,   128,  16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   192,   128,  16,  16,  16,   16,    7,    6,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   192,   128,  16,  16,  16,   16,    7,    6,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instances_part2 =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        // Compute friendly
+        // 224x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,   128,   128,  16,  16,  16,   16,    7,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   224,    96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -131,19 +139,17 @@ using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instan
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    64,   256,  16,  16,  16,   16,    6,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
         // clang-format on
         >;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instances_part2 =
+using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instances_part3 =
     std::tuple<
-// clang-format off
+        // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
 
         // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256
@@ -167,18 +173,16 @@ using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instan
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   128,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    64,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
         // clang-format on
         >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|       Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |                |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Latency friendly 
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, F16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,      S<2, 2, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, F16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,      S<4, 4, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
@@ -214,8 +218,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_instances = std::
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, F16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    32,  128,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,      S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>, Row,     F8,     F8,    Tuple<F32, F32>, F16,   F32,     F32,  PassThrough, PassThrough, MultiplyMultiply,     GemmSpec,   256,    32,  128,    256, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,      S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
 
-// clang-format on
-#endif
+    // clang-format on
     >;
 } // namespace instance
 } // namespace device
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp
new file mode 100644
index 0000000000..858713f2be
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instance_part3.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_default_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instances_part3<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp
new file mode 100644
index 0000000000..6428802cc3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_f16/device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instance_part3.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_kpadding_instances_part3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_mfma16x16_instances_part3<
+            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt
deleted file mode 100644
index 943b2bf4c7..0000000000
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-# ONLY XDL_KERNELS
-set(GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES)
-
-list(APPEND GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES 
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
-
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
-        )
-
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-
-add_instance_library(device_gemm_multiply_multiply_weight_preshuffle_instance ${GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt
new file mode 100644
index 0000000000..37233ac5b4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt
@@ -0,0 +1,82 @@
+# ONLY XDL_KERNELS
+set(GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES)
+
+list(APPEND GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES 
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
+        f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
+
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
+        f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
+        )
+
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p4.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+add_instance_library(device_gemm_multiply_multiply_wp_instance ${GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
index e3ff079d99..7bb36cf9f5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -23,7 +23,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma1
 {
     add_device_operation_instances(
         instances,
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances<
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p1<
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
index 71383f5dc1..e641215793 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -23,7 +23,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
 {
     add_device_operation_instances(
         instances,
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances<
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p2<
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp
new file mode 100644
index 0000000000..cd9c8564a7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p3<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p4.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
new file mode 100644
index 0000000000..723bbd0aa1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p4(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p4<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
new file mode 100644
index 0000000000..2ea8a8a8be
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p5(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p5<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
new file mode 100644
index 0000000000..696b4616aa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p6(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p6<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp
similarity index 57%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp
index a138452295..e5ada03a46 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp
@@ -40,107 +40,135 @@ static constexpr auto v2 = BlockGemmPipelineVersion::v2;
 template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_instances =
     std::tuple<
-// clang-format off
-        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    128,   128,  16,  16,  32,   32,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    256,   128,  16,  16,  32,   32,    8,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 512
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
-#endif
         // clang-format on
         >;
 
 template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_instances =
     std::tuple<
-// clang-format off
-        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   256,  16,  16,  32,   32,    4,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   512,  16,  16,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   256,  16,  16,  32,   32,    4,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        // N 512
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
-#endif
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
         // clang-format on
         >;
 
 template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_instances =
     std::tuple<
-// clang-format off
-        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        // N 256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // N 512
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+        // clang-format on
+        >;
+
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_instances =
+    std::tuple<
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
-#endif
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     32,    16,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 4>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   128,  16,  16,  16,   16,    1,    1,     S<8, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     32,    64,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     64,    64,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     64,    16,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     64,    16,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+
+        // clang-format on
+        >;
+
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_instances =
+    std::tuple<
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
         // clang-format on
         >;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances =
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances_p1 =
     std::tuple<
-// clang-format off
-        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  32,   32,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  32,   32,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  32,   32,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances_p2 =
+    std::tuple<
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  32,   32,    7,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  32,   32,    6,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  32,   32,    5,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
         // clang-format on
         >;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances =
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p1 =
     std::tuple<
-// clang-format off
+        // clang-format off
         //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         // 256x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -149,7 +177,18 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    160,   128,  16,  16,  16,   16,    8,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  16,   16,    8,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     96,   128,  16,  16,  16,   16,    8,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     64,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     64,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p2 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         // 224x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  16,   16,    7,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -157,7 +196,17 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  16,   16,    7,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p3 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         // 192x[64, 256, 32]x128, 192x[64]x256
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  16,   16,    6,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    224,   128,  16,  16,  16,   16,    6,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -165,7 +214,17 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    160,   128,  16,  16,  16,   16,    6,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  16,   16,    6,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p4 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  16,   16,    5,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    224,   128,  16,  16,  16,   16,    5,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -173,19 +232,38 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    160,   128,  16,  16,  16,   16,    5,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  16,   16,    5,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     96,   128,  16,  16,  16,   16,    5,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        // 128x[64, 256, 32]x128, 128x[64, 128, 32]x256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    192,   128,  16,  16,  16,   16,    4,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p5 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   128,  16,  16,  16,   16,    4,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   128,  16,  16,  16,   16,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_p6 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    192,   128,  16,  16,  16,   16,    4,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp
new file mode 100644
index 0000000000..c149a54ee6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances_p1<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp
new file mode 100644
index 0000000000..b5390e5c8d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances_p2<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
index 6e9b3ea172..450232177f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
index cc543b19c1..7d5457a42e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
index 8557d0c80e..89490f0cfb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
index 9fcce478e7..553e2e0bfc 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
index 84c2c70e35..410f0f47e0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
index 0933b1fe18..168be2fa7d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp
new file mode 100644
index 0000000000..d25d23e3d6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_instances<
+            v1,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp
new file mode 100644
index 0000000000..a5e5961942
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_instances<
+            v2,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp
new file mode 100644
index 0000000000..9218431d19
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_instances<
+            v1,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp
new file mode 100644
index 0000000000..0767db101c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_instances<
+            v2,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
index 05529f9cdd..4d6502fff0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -23,7 +23,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16
 {
     add_device_operation_instances(
         instances,
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances<
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p1<
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
new file mode 100644
index 0000000000..6070e46a70
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p2<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp
new file mode 100644
index 0000000000..ba81254ade
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p3(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p3<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
new file mode 100644
index 0000000000..acc420568b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p4(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p4<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
new file mode 100644
index 0000000000..af51f745b6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p5(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p5<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
new file mode 100644
index 0000000000..864fb03176
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p6<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp
similarity index 58%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp
index c4f53e834a..dc9db8889a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp
@@ -40,107 +40,131 @@ static constexpr auto v2 = BlockGemmPipelineVersion::v2;
 template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_instances =
     std::tuple<
-// clang-format off
-        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    128,   128,  16,  16,  32,   32,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    256,   128,  16,  16,  32,   32,    8,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 512
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
-#endif
         // clang-format on
         >;
 
 template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_instances =
     std::tuple<
-// clang-format off
-        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   256,  16,  16,  32,   32,    4,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   512,  16,  16,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   256,  16,  16,  32,   32,    4,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        // N 512
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
-#endif
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
         // clang-format on
         >;
 
 template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_instances =
     std::tuple<
-// clang-format off
-        //################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        // N 256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        // N 512
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+        // clang-format on
+        >;
+
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_instances =
+    std::tuple<
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
-#endif
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+        // clang-format on
+        >;
+
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_instances =
+    std::tuple<
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   128,  16,  16,  16,   16,    1,    1,     S<8, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   256,  16,  16,  16,   16,    1,    1,     S<16, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   128,   8,  16,  16,   16,    1,    1,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+
         // clang-format on
         >;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances =
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances_p1 =
     std::tuple<
-// clang-format off
-        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  32,   32,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  32,   32,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  32,   32,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances_p2 =
+    std::tuple<
+        // clang-format off
+        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  32,   32,    7,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  32,   32,    6,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  32,   32,    5,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
         // clang-format on
         >;
 
 template <GemmSpecialization GemmSpec>
-using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances =
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p1 =
     std::tuple<
-// clang-format off
+        // clang-format off
         //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
         // 256x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -149,7 +173,18 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    160,   128,  16,  16,  16,   16,    8,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  16,   16,    8,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     96,   128,  16,  16,  16,   16,    8,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     64,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     64,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p2 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         // 224x[64, 256, 32]x128
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  16,   16,    7,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -157,7 +192,17 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  16,   16,    7,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p3 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         // 192x[64, 256, 32]x128, 192x[64]x256
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  16,   16,    6,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    224,   128,  16,  16,  16,   16,    6,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -165,7 +210,17 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    160,   128,  16,  16,  16,   16,    6,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  16,   16,    6,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p4 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  16,   16,    5,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    224,   128,  16,  16,  16,   16,    5,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -173,19 +228,38 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    160,   128,  16,  16,  16,   16,    5,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  16,   16,    5,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     96,   128,  16,  16,  16,   16,    5,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        // 128x[64, 256, 32]x128, 128x[64, 128, 32]x256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    192,   128,  16,  16,  16,   16,    4,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p5 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   128,  16,  16,  16,   16,    4,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   128,  16,  16,  16,   16,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
-#endif
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances_p6 =
+    std::tuple<
+        // clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    192,   128,  16,  16,  16,   16,    4,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
         // clang-format on
         >;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp
similarity index 89%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp
index c123a0fdd8..e89d2c1f89 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -23,7 +23,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_m
 {
     add_device_operation_instances(
         instances,
-        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances<
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances_p1<
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp
new file mode 100644
index 0000000000..a551342dd8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances_p2<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
index cb15688e6f..677d0ce58f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
index c5a8448b59..3167098ba7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
index c9ab9c1071..42d6020693 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
index bb83bacb35..5e89c2623e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
index fb43347ceb..2a19568557 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
rename to library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
index c8ff03d6ef..25b98b629a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp
new file mode 100644
index 0000000000..363ffbf057
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_instances<
+            v1,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp
new file mode 100644
index 0000000000..4152e10305
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_instances<
+            v2,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp
new file mode 100644
index 0000000000..8952643bb8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_instances<
+            v1,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp
new file mode 100644
index 0000000000..abc0c8bb4e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_instances<
+            v2,
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt
new file mode 100644
index 0000000000..a166fc4ce4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt
@@ -0,0 +1,14 @@
+# ONLY MX_KERNELS
+set(GEMM_MX_INSTANCES)
+
+list(APPEND GEMM_MX_INSTANCES 
+        device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp
+        device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp
+    )
+
+
+set_source_files_properties(device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+
+add_instance_library(device_gemm_mx_instance ${GEMM_MX_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp
new file mode 100644
index 0000000000..1e979f69ca
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using F16  = half_t;
+using BF16 = bhalf_t;
+using F32  = float;
+using E8M0 = ck::e8m0_bexp_t;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto ScaleBlockSize = 32;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_instances = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle|           A|           B|           C|          GEMM|    Scale Block| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Data| Type|  Data|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|           Size|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |  Type|     |  Type|      |        |         |   Operation|   Operation|   Operation|              |               |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |     |      |      |        |         |            |            |            |              |               |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT)
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   128,   128,   16,    128,  16,  16,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128,   256,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,      false,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     false,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   256,   128,    64,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,    64,    16,    16,   512,  16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>
+
+//Require verification
+      //DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>
+#endif
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp
new file mode 100644
index 0000000000..05914e06b5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             F8,
+                                             E8M0,
+                                             F8,
+                                             E8M0,
+                                             BF16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_instances<Intrawave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp
new file mode 100644
index 0000000000..0ca4f2a3ce
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using F16  = half_t;
+using BF16 = bhalf_t;
+using F32  = float;
+using E8M0 = ck::e8m0_bexp_t;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto ScaleBlockSize = 32;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_instances = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle|           A|           B|           C|          GEMM|    Scale Block| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Data| Type|  Data|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|           Size|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |  Type|     |  Type|      |        |         |   Operation|   Operation|   Operation|              |               |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |     |      |      |        |         |            |            |            |              |               |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT)
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   128,   128,   16,    128,  16,  16,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128,   256,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,      false,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     false,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   256,   128,    64,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,    64,    16,    16,   512,  16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>
+
+      //Require verification
+      //DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F8,  E8M0,   F8,  E8M0,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+#endif
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp
new file mode 100644
index 0000000000..f4e59cf92d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             F8,
+                                             E8M0,
+                                             F8,
+                                             E8M0,
+                                             F16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_instances<Intrawave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp
index 385f474ecc..efc7a7ebfd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp
@@ -33,13 +33,11 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_iw_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         //PipelineVersion::v1; interwave
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp
index 0b48bbf606..6a323d323f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp
@@ -34,13 +34,11 @@ template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
           ck::PipelineVersion PipVer,
           ck::LoopScheduler LoopSche>
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    16,   128,     4,  8,   16,   16,    1,    4,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               4,  F16, PipVer, LoopSche>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    16,   256,     4,  8,   16,   16,    1,    8,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               4,  F16, PipVer, LoopSche>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,    16,   256,     4,  8,   16,   16,    1,    4,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 16>,              4,  F16, PipVer, LoopSche>,        
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp
index 422db05b35..2855235f97 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp
@@ -33,14 +33,12 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_iw_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
         //PipelineVersion::v1; interwave
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
index ade65eacf3..18eeefa522 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
@@ -1,7 +1,17 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 set(GEMM_UNIVERSAL_INSTANCES)
 
-list(APPEND GEMM_UNIVERSAL_INSTANCES 
+list(APPEND GEMM_UNIVERSAL_INSTANCES
+        device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp
+
+        device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
+        device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
+
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -18,7 +28,7 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-        
+
         device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
         device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
         device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -57,6 +67,16 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
         device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
         )
 
+set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
 set_source_files_properties(device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
@@ -80,6 +100,9 @@ set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm
 set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 list(APPEND GEMM_UNIVERSAL_INSTANCES
+          device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
+          device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -134,25 +157,28 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
           device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
         )
 
- set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-        
- set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
- set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
- set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 
 add_instance_library(device_gemm_universal_instance ${GEMM_UNIVERSAL_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp
new file mode 100644
index 0000000000..5d3bb3f7b4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..c9a730de68
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp
new file mode 100644
index 0000000000..6c3a641f9f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..cd88edec59
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..b700e78d3d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        // Configurations used during development, mainly for testing
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..9951c02251
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp
new file mode 100644
index 0000000000..7b4cd64d33
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..3a607c4178
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp
new file mode 100644
index 0000000000..3751dc5a11
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..3971802415
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp
new file mode 100644
index 0000000000..222b49eb7d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..36901b4f38
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..6960375ed6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        // Configurations used during development, mainly for testing
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..bbc8b92217
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp
new file mode 100644
index 0000000000..7f71cf6f59
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    32,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   256,    64,    64,   8,   8,   16,   16,       8,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,   256,    64,   8,   8,   16,   16,       2,       8,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    16,    64,    64,   8,   8,   16,   16,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    32,    64,   8,   8,   16,   16,       1,       2,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..331ca8b2ff
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp
new file mode 100644
index 0000000000..2fca3551b4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm| Compute| Compute|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|   TypeA|   TypeB|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |        |        |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |        |        |
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    64,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         0,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..5087a9d719
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    if(ck::is_gfx11_supported())
+        return;
+
+    add_device_operation_instances(
+        instances, device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp
new file mode 100644
index 0000000000..244eb69190
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm| Compute| Compute|
+        //#########################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|   TypeA|   TypeB|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |        |        |
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |        |        |
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    32,    16,    16,    32,   8,   8,   16,   16,       1,       1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>
+        // clang-format on
+        >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..89df765517
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    if(ck::is_gfx11_supported())
+        return;
+
+    add_device_operation_instances(
+        instances, device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
index 5540d2d884..59154f3439 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
@@ -36,13 +36,11 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -60,13 +58,11 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tu
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
index d9f9969621..b962d75b12 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
@@ -36,29 +36,20 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -66,16 +57,23 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tu
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -90,15 +88,19 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   8,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   4,   8,  16,   16,    1,    4,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   8,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   2,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
+// instances not working on gfx950
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   8,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
index 02272e84b3..5396c16cb8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
index 89acbb6f68..5cc2d987bc 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instanc
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
index 52227620b4..3f7e2fe302 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmMKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances_part2<
+                GemmMKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
index 311b1d0b28..550fbce684 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instanc
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances<GemmMPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances_part2<GemmMPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
index 40a9239ad8..01fcabc872 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances<Intrawave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
index 3a9dc7b081..22ad32465e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances<Intrawave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
index 8fef199b25..335bbb6d36 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -17,6 +18,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_inst
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances<Intrawave,
                                                                         GemmMKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmMKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
index 4f2c95e20d..3102f7c158 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances<Interwave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2<Interwave,
+                                                                                  GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
index c58cfb592f..7c0ce3019f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances<Interwave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2<Interwave,
+                                                                                  GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
index 7cd3bfc882..e54c611e73 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -17,6 +18,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_inst
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances<Interwave,
                                                                         GemmMKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances_part2<Interwave,
+                                                                                  GemmMKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
index 0619a98cf0..9f142ad831 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -36,39 +36,37 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -83,13 +81,18 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
+// instances not working on gfx950
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
index 246e7a5067..e86d0a6ff9 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
index 9737dd5f0c..5f619d2850 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instanc
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
index 01f1315646..6a1ad1e890 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
index 0d1cb4f25b..65c1cbce96 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
index 377e2f90a6..d98dd7b99d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances<Intrawave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
index 2ad1d1e52f..9da5e8b1cd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances<Intrawave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
index f82fb92302..fea562d208 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -17,6 +18,15 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_ins
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances<Intrawave,
                                                                         GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2<
+                Intrawave,
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
index 2f8abf0a88..cc05571615 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances<Interwave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2<Interwave,
+                                                                                  GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
index 2e0f670aad..939adc4c90 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances<Interwave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2<Interwave,
+                                                                                  GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
index d7dc599748..0f3d081cf7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -17,6 +18,15 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_ins
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances<Interwave,
                                                                         GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances_part2<
+                Interwave,
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
index 3edbd28cd8..7d141a47e1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -36,44 +36,42 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // AGPR Spill
-        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // AGPR Spill when use permuted lds layout. so, use padding for these two.
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   192,    64,   8,   8,  32,   32,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        // AGPR Spill
+        // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        // AGPR Spill when use permuted lds layout. so, use padding for these two.
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   192,    64,   8,   8,  32,   32,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -90,13 +88,17 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
+// instances not working on gfx950
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
index cbfaf9aaa0..d30ce71d6a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
index 3b0ccf9d83..b908cacb4b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,13 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instanc
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
index 3800b7955d..301395c4e6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
index 9e78df7b5e..ddaa504345 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances_part2<Intrawave,
+                                                                                  GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
index 21f118ea60..b08137c17f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instan
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances_part2<Interwave,
+                                                                                  GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
index 3e841a8498..ed553738b1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_insta
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances_part2<Interwave,
+                                                                                  GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
index e3f3afff46..940da94e70 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -34,22 +34,17 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -58,16 +53,21 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
index 41d6481c9f..8e5ce4ded4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
index de41821d98..1922e2b9ab 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
index cdde9fa43c..8cfcb85aa0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances_part2<GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
index 04237cc62c..e502314388 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances_part2<GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
index e39c9a63b9..d83014d5e8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
@@ -34,13 +34,11 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -57,13 +55,9 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
         // AGPR Spill
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill when use permuted lds layout. so, use padding for these two.
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -71,16 +65,22 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -104,10 +104,6 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -116,6 +112,14 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   2,   2,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
+// instances not working on gfx950
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
index 77addd6ad2..ecae32aa18 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
index 4fb034d3b0..f65f47a08a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
index 7f7ec14ba6..3d16b5c282 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instances
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances<Intrawave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances_part2<Intrawave,
+                                                                               GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
index 32634a6129..b834022c91 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances<Intrawave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances_part2<Intrawave,
+                                                                               GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
index 3062add942..50063c81ad 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instances
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances<Interwave, GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances_part2<Interwave,
+                                                                               GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
index ede5e4c428..99469f5436 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -16,6 +17,14 @@ void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances<Interwave, GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances_part2<Interwave,
+                                                                               GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp
index 43dc6be076..ff13de1d6a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp
@@ -35,36 +35,36 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         // Disable due to test failure
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   4,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   4,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   4,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
index f3e96e83f8..0a86fd3f9e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
index d73b75fcdc..e88928efbc 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
index 19894a4402..05b3e3762d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances_part2<GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
index f1123e5715..83db356dc8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances_part2<GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp
index 9bdb2f51c2..bb10da37f4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp
@@ -35,33 +35,33 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
index b6d916f26c..bae0d6f7c6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
index e72a748e96..906368f54d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
index 616133e1ba..315a6446c1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -47,11 +47,6 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,   8,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,   128,  16,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  16,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  16,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
@@ -61,6 +56,23 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances_part2 = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //Only enable these instances on gfx94x
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,   128,  16,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple<
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
index 96f171e066..e2f843ac58 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
index 4965fe51c6..3778848db3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp
index d325c47d8a..2e6224c139 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances<GemmNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances_part2<GemmNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
index 6388c13444..27d7933477 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -42,16 +42,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
         // Compute friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   192,    128, 16,  16,  32,   32,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  16,  16,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
@@ -63,6 +54,26 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_part2 = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   192,    128, 16,  16,  32,   32,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
index 48581a7344..d6c9809020 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_part2<GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
index 09bc544deb..fc6ad01742 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -15,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_part2<GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
index 7fb690c8b2..1cc0b8c76e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -50,9 +50,6 @@ using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances =
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   160,    64,   8,   8,  16,   16,    8,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   160,    64,   8,   8,  32,   32,    1,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,          S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -62,6 +59,15 @@ using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances =
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           GemmSpecialization GemmSpec,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
index 12aa7c3804..3c17d4cbc0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_
     add_device_operation_instances(
         instances,
         device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
index 68c6ce89ab..6cadb7deb8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -39,20 +39,15 @@ template <GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout|  DsLayout| CLayout| AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |          |        |  Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                      Pipeline|
         //#########################|        |        |          |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                      Verision|
         //#########################|        |        |          |        |      |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                              |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -60,19 +55,26 @@ using device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances =
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout|  DsLayout| CLayout|  AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |          |        |   Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                      Pipeline|
         //#########################|        |        |          |        |       |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Verision|
         //#########################|        |        |          |        |       |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                              |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,   BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,    16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,   BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,    64,    16,    16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
index bd5cfc5f20..4be2db2531 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_default_i
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
index f5c2c95a03..93fa56c1a5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
index 978a027048..cd032e2a23 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_mnkpaddin
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
index c8296e9bfc..3ff8d8e172 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp
index 4e1e5567d5..c753a91d0a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp
@@ -40,15 +40,11 @@ template <GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|    DsData|  CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                          Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |         |        | Type|  Type|      Type|   Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                                Pipeline|                      Pipeline|
         //#########################|        |        |         |        |     |      |          |       |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                               Scheduler|                      Verision|
         //#########################|        |        |         |        |     |      |          |       |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                        |                              |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        //DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,   128,   128,  16,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v2>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -59,20 +55,30 @@ using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// double rate mfma instances on gfx950
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances_2x = std::tuple<
+    // clang-format off
+        //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|    DsData|  CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                          Block-wiseGemm|                Block-wiseGemm|
+        //#########################|        |        |         |        | Type|  Type|      Type|   Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                                Pipeline|                      Pipeline|
+        //#########################|        |        |         |        |     |      |          |       |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                               Scheduler|                      Verision|
+        //#########################|        |        |         |        |     |      |          |       |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                        |                              |
+        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,   128,   128,  16,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v2>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|     DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |         |        | Type|  Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                      Pipeline|
         //#########################|        |        |         |        |     |      |           |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Verision|
         //#########################|        |        |         |        |     |      |           |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                              |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        //DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,  128,   128,  16,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-#else
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,    64,    16,   16,   256,   8,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    16,   32,   256,   8,   4,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -85,7 +91,6 @@ using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_mem_instances = std
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
index 66db3ddb3d..0ad6c2c75a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_default_ins
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
index 7d3c832ee5..b5f784b9aa 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_kpadding_in
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
index 2759b878a3..1c48fea463 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
index 04a0229300..e500ebde87 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_i
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
index 928c325ab7..d4fed3b561 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -39,20 +39,15 @@ template <GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout|  DsLayout| CLayout|AData| BData|     DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |          |        | Type|  Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |          |        |     |      |           |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |          |        |     |      |           |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -60,19 +55,26 @@ using device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances = std
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec,
+          typename DsLayout   = ck::Tuple<>,
+          typename DsDataType = ck::Tuple<>>
+using device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|    DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |         |        | Type|  Type|      Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |         |        |     |      |          |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |         |        |     |      |          |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,    Row,  F16,   F16, DsDataType,  F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,    Row,  F16,   F16, DsDataType,  F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
index 53d2f8f0c3..00d88467e8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_default_inst
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances_part2<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
index 11f696c509..a7e60a6d40 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_kpadding_ins
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances_part2<
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
index 34e129bdcb..7ce358c695 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_mnkpadding_i
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances_part2<
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
index 7a580b6716..e64c145987 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -24,6 +25,14 @@ void add_device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_mnpadding_in
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances_part2<
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
old mode 100644
new mode 100755
index e1612bcd24..b7391d3446
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
@@ -21,9 +21,7 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-
         device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
-
         device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -44,7 +42,6 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
-
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -65,7 +62,6 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
-        
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp
@@ -101,6 +97,21 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
-        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp)
-
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+)
 add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
index 2a02995827..5353fe16b5 100755
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -48,14 +48,18 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances =
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple<
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
old mode 100755
new mode 100644
index 8b2bfb5d26..9d95e6ea5a
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
old mode 100755
new mode 100644
index a7c33ffdc4..3e54a4bb5b
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
old mode 100755
new mode 100644
index adc2f23d40..544723ef6a
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpaddi
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
old mode 100755
new mode 100644
index 0336f64665..e6959c0945
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpaddin
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2<
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
index 425c2c0391..959c1c0992 100755
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -51,11 +51,9 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances =
         // AGPR Spill
         // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill when use permuted lds layout. so, use padding for these two.
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
 
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+
 
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -65,6 +63,13 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances =
 
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple<
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
old mode 100755
new mode 100644
index e192bf14c5..924ed814ce
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
old mode 100755
new mode 100644
index d58ec3eb33..daed25130f
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances_part2<
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp
new file mode 100755
index 0000000000..a16d3988fe
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp
new file mode 100755
index 0000000000..3716b46f6c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmMNPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
new file mode 100755
index 0000000000..00ed1698dd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                                GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
new file mode 100755
index 0000000000..bee03061a0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                                GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
old mode 100755
new mode 100644
index aa193417d9..282cea7563
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
@@ -34,14 +34,11 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(__gfx950__)
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,   128,  16,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v2>
-#else        
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -57,19 +54,16 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-#endif // defined(__gfx950__)
     // clang-format on
     >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(__gfx950__)
-#else        
         // Latency friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -100,7 +94,6 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-#endif // defined(__gfx950__)
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
old mode 100755
new mode 100644
index a685c4f252..7335a9851f
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
@@ -34,13 +34,11 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -58,9 +56,6 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
      
-#if !defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -69,16 +64,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
+// instances not working on gfx950
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances_part2 = std::tuple<
+    // clang-format off
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
index 546f909b3c..4f048b6525 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_default_ins
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmDefault>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances_part2<
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
index d91de96be3..c547487983 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_kpadding_in
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances_part2<
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
index c70678b449..28afafa26d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnkpadding_
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmMNKPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances_part2<
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
index 5410a0cc25..e9bbc4d732 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -22,6 +23,14 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_mnpadding_i
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances<GemmMNPadding>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances_part2<
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp
new file mode 100755
index 0000000000..5bf5c01b97
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmNKPadding  = GemmSpecialization::NKPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        //Only enable these instances on gfx94x
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,   8,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,   128,  16,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  16,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  16,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,   128,  16,   4,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   128,  16,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 16,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 1, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 16,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 1, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    128, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 16,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    128, 16,   4,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    128, 16,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    128, 16,   4,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 16,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 1, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 16,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 1, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    128, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 16,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 16,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<64, 2, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    128, 16,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    128, 16,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    128,  8,   8,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..689c2bbbec
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
new file mode 100644
index 0000000000..149b830a83
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp
new file mode 100644
index 0000000000..db5082f25c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances<GemmNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp
new file mode 100644
index 0000000000..cd2ad4f654
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 0000000000..1ed170785b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp
new file mode 100644
index 0000000000..9e28c16191
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances<Intrawave,
+                                                                            GemmNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp
new file mode 100644
index 0000000000..85dc38fbe4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 0000000000..2f188ac939
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp
new file mode 100644
index 0000000000..94684921c7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Row, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances<Interwave,
+                                                                            GemmNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp
new file mode 100755
index 0000000000..540b90e54b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  16,  16,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  16,  16,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   128,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,    128, 16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+    // clang-format on
+    >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    128, 16,  16,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    128, 16,  16,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    128, 16,  16,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,     64, 16,  16,  16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    128, 16,  16,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    128, 16,  16,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    128, 16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    128, 16,  16,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    128, 16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
new file mode 100644
index 0000000000..df07e21eef
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
new file mode 100644
index 0000000000..22ffb264b7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
new file mode 100644
index 0000000000..d5e84297d9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
new file mode 100644
index 0000000000..314aec027a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
new file mode 100644
index 0000000000..eb0c871a04
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
new file mode 100644
index 0000000000..df92ed71c4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm_Streamk_V2<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
index ad430340ea..0ef09c55ee 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
@@ -7,6 +7,21 @@ add_instance_library(
 	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
 	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_16_16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp
+	xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp
 
        wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp
        wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index 226dca5083..bf775b04c0 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[g, n, hi, wi, c] * wei[g, k, y, x, c] = in[g, n, ho, wo, k]
+
 void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   GNHWK,
@@ -26,21 +26,21 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_bf16_instances<2,
-                                                        GNHWK,
-                                                        GKYXC,
-                                                        Empty_Tuple,
-                                                        GNHWC,
-                                                        ConvBwdDataDefault>{});
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2,
+                                                              GNHWK,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWC,
+                                                              ConvBwdDataDefault>{});
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_bf16_instances<2,
-                                                        GNHWK,
-                                                        GKYXC,
-                                                        Empty_Tuple,
-                                                        GNHWC,
-                                                        ConvBwdDataFilter1x1Stride1Pad0>{});
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2,
+                                                              GNHWK,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWC,
+                                                              ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 64fbf8bbf2..1a3c80e5cf 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[g, n, hi, wi, c] * wei[g, k, y, x, c] = in[g, n, ho, wo, k]
+
 void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   GNHWK,
@@ -26,21 +26,21 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f16_instances<2,
-                                                       GNHWK,
-                                                       GKYXC,
-                                                       Empty_Tuple,
-                                                       GNHWC,
-                                                       ConvBwdDataDefault>{});
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2,
+                                                             GNHWK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             GNHWC,
+                                                             ConvBwdDataDefault>{});
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f16_instances<2,
-                                                       GNHWK,
-                                                       GKYXC,
-                                                       Empty_Tuple,
-                                                       GNHWC,
-                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2,
+                                                             GNHWK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             GNHWC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index f9351d96f2..96623a5161 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[g, n, hi, wi, c] * wei[g, k, y, x, c] = in[g, n, ho, wo, k]
+
 void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   GNHWK,
@@ -26,21 +26,21 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f32_instances<2,
-                                                       GNHWK,
-                                                       GKYXC,
-                                                       Empty_Tuple,
-                                                       GNHWC,
-                                                       ConvBwdDataDefault>{});
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2,
+                                                             GNHWK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             GNHWC,
+                                                             ConvBwdDataDefault>{});
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f32_instances<2,
-                                                       GNHWK,
-                                                       GKYXC,
-                                                       Empty_Tuple,
-                                                       GNHWC,
-                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2,
+                                                             GNHWK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             GNHWC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp
new file mode 100644
index 0000000000..f3aded5043
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2,
+                                                              NGKHW,
+                                                              GKCYX,
+                                                              Empty_Tuple,
+                                                              NGCHW,
+                                                              ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
similarity index 64%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
index d98b89c556..e8c6bc7cbe 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -1,36 +1,37 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
 {
     add_device_operation_instances(
         instances,
-        device_grouped_conv_fwd_xdl_int8_comp_instances<2,
-                                                        NGCHW,
-                                                        GKYXC,
-                                                        Empty_Tuple,
+        device_grouped_conv_bwd_data_xdl_bf16_instances<2,
                                                         NGKHW,
-                                                        ConvFwdDefault>{});
+                                                        GKCYX,
+                                                        Empty_Tuple,
+                                                        NGCHW,
+                                                        ConvBwdDataDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp
new file mode 100644
index 0000000000..eed5403914
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_transpose_xdl_bf16_instances<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_16_16_instance.cpp
new file mode 100644
index 0000000000..99e54abaa0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_16_16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2,
+                                                             NGKHW,
+                                                             GKCYX,
+                                                             Empty_Tuple,
+                                                             NGCHW,
+                                                             ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
new file mode 100644
index 0000000000..3f94d30a55
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_instances<2,
+                                                       NGKHW,
+                                                       GKCYX,
+                                                       Empty_Tuple,
+                                                       NGCHW,
+                                                       ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp
new file mode 100644
index 0000000000..1f04d143b1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_transpose_xdl_f16_instances<2,
+                                                                 NGKHW,
+                                                                 GKCYX,
+                                                                 Empty_Tuple,
+                                                                 NGCHW,
+                                                                 ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_16_16_instance.cpp
new file mode 100644
index 0000000000..e84cd1d2d2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_16_16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2,
+                                                             NGKHW,
+                                                             GKCYX,
+                                                             Empty_Tuple,
+                                                             NGCHW,
+                                                             ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
new file mode 100644
index 0000000000..b5e89c9b7c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_instances<2,
+                                                       NGKHW,
+                                                       GKCYX,
+                                                       Empty_Tuple,
+                                                       NGCHW,
+                                                       ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp
new file mode 100644
index 0000000000..b822c82c16
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_transpose_xdl_f32_instances<2,
+                                                                 NGKHW,
+                                                                 GKCYX,
+                                                                 Empty_Tuple,
+                                                                 NGCHW,
+                                                                 ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
new file mode 100644
index 0000000000..5f8f22a47d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_generic_instances<2,
+                                                                NGKHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGCHW,
+                                                                ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
new file mode 100644
index 0000000000..e266d79253
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_generic_instances<2,
+                                                               NGKHW,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NGCHW,
+                                                               ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
new file mode 100644
index 0000000000..76dd477e06
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NGKHW,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_generic_instances<2,
+                                                               NGKHW,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NGCHW,
+                                                               ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp
new file mode 100644
index 0000000000..603ff268a0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2,
+                                                              NHWGK,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGC,
+                                                              ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2,
+                                                              NHWGK,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGC,
+                                                              ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index 5d9194798b..11e0fc6073 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k]
+
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   NHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp
new file mode 100644
index 0000000000..a80c33df0b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2,
+                                                             NHWGK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             NHWGC,
+                                                             ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2,
+                                                             NHWGK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             NHWGC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index 5269bb9652..a63dd712b6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k]
+
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   NHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp
new file mode 100644
index 0000000000..fddcc7c290
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
+                                                                  NHWGK,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2,
+                                                             NHWGK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             NHWGC,
+                                                             ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2,
+                                                             NHWGK,
+                                                             GKYXC,
+                                                             Empty_Tuple,
+                                                             NHWGC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index adfa08c1f7..e4b4165928 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k]
+
 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<2,
                                                                   NHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
index ac393560b7..9d5be260cb 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -1,47 +1,53 @@
 # ONLY XDL_AND_DL_KERNELS
 set(GROUPED_CONV2D_BWD_WEIGHT
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
+    xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
+
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp
+
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instance.cpp    
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instance.cpp
+    xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instance.cpp
+
+    xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+    xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
+    xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
     )
 
 if(DL_KERNELS)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instance.cpp
new file mode 100644
index 0000000000..d63cb7375a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKCYX,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances<
+            2,
+            NGCHW,
+            GKCYX,
+            NGKHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp
index 9fbdc6c461..0f0817b775 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            BF16,
                                                            BF16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_p
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
             2,
             NGCHW,
-            GKYXC,
+            GKCYX,
             NGKHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instance.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instance.cpp
index e1c865a883..8d4ae3dd76 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            BF16,
                                                            BF16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_p
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
             2,
             NGCHW,
-            GKYXC,
+            GKCYX,
             NGKHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instance.cpp
new file mode 100644
index 0000000000..c28de81134
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKCYX,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances<
+            2,
+            NGCHW,
+            GKCYX,
+            NGKHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp
index bbab53d9b5..7efe6f7bc1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F16,
                                                            F16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pi
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances<
             2,
             NGCHW,
-            GKYXC,
+            GKCYX,
             NGKHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instance.cpp
similarity index 88%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instance.cpp
index b3b5489930..9831e87569 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F16,
                                                            F16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pi
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances<
             2,
             NGCHW,
-            GKYXC,
+            GKCYX,
             NGKHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
index 2c494f22ff..ff7ca68215 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            BF16,
                                                            BF16,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
                                                                     NGCHW,
-                                                                    GKYXC,
+                                                                    GKCYX,
                                                                     NGKHW,
                                                                     ConvBwdWeightDefault,
                                                                     1,
@@ -36,7 +36,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
                                                                     NGCHW,
-                                                                    GKYXC,
+                                                                    GKCYX,
                                                                     NGKHW,
                                                                     ConvBwdWeightDefault,
                                                                     4,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
index 911751b0b9..829debe292 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F16,
                                                            F16,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<2,
                                                                    NGCHW,
-                                                                   GKYXC,
+                                                                   GKCYX,
                                                                    NGKHW,
                                                                    ConvBwdWeightDefault,
                                                                    1,
@@ -36,7 +36,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<2,
                                                                    NGCHW,
-                                                                   GKYXC,
+                                                                   GKCYX,
                                                                    NGKHW,
                                                                    ConvBwdWeightDefault,
                                                                    4,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
similarity index 94%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
index 3b1fdd9be7..4b4dd38132 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
-                                                           GKYXC,
+                                                           GKCYX,
                                                            NGKHW,
                                                            F32,
                                                            F32,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
                                                                    NGCHW,
-                                                                   GKYXC,
+                                                                   GKCYX,
                                                                    NGKHW,
                                                                    ConvBwdWeightDefault,
                                                                    1,
@@ -36,7 +36,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
                                                                    NGCHW,
-                                                                   GKYXC,
+                                                                   GKCYX,
                                                                    NGKHW,
                                                                    ConvBwdWeightDefault,
                                                                    4,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
index 0b429af832..e554e7558f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
index d70c95bf6e..0d16f9bd82 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
new file mode 100644
index 0000000000..da10f9c235
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<2,
+                                                                           NGCHW,
+                                                                           GKYXC,
+                                                                           NGKHW,
+                                                                           ConvBwdWeightDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
index 74ccc4c89b..6e77488299 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
index fab2898559..4a0e89f0fe 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
index 407645e893..9a0da7c431 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
index 807de66ca5..e2ecee734f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
index 084c83cd65..a65c20c840 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
index d174e5b6c0..089953dad2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
index cac9353354..678e5d234f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
similarity index 96%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index ee71e37e79..54edc0d247 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index f6e1ada352..f77d88e71c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index 384706414a..e6115f28a1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
index 146916cfd3..3a101baac0 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -15,6 +15,10 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
+   # NGCHW, GKCYX, NGKHW
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
    # large tensor
    # NHWGC, GKYXC, NHWGK
    xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -27,11 +31,10 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
-   # NGCHW, GKYXC, NGKHW
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
-   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
+   # NGCHW, GKCYX, NGKHW
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instance.cpp
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instance.cpp
    #mem
    # NHWGC, GKYXC, NHWGK
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
@@ -43,27 +46,32 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
-   # NGCHW, GKYXC, NGKHW
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
-   # NGCHW, GKYXC, NGKHW
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
+   # NGCHW, GKCYX, NGKHW
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instance.cpp
+   # NGCHW, GKCYX, NGKHW
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instance.cpp
    #comp
    # NHWGC, GKYXC, NHWGK
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
-   # NGCHW, GKYXC, NGKHW
-   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
-   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp
-   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp
-   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp
+   # NGCHW, GKCYX, NGKHW
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp
    #dl
    # GNHWC, GKYXC, GNHWK
    dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -79,8 +87,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp
   wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp
   wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp
-  wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp
-  wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp
   ## NHWGC, GKYXC, NHWGK
   wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp
   wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp
@@ -88,6 +94,4 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp
   wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp
   wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp
-  wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp
-  wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp
 )
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp
deleted file mode 100644
index a8f723dfec..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                GNHWC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                GNHWK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_f16_instances<2,
-                                                                              GNHWC,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              GNHWK,
-                                                                              Empty_Tuple,
-                                                                              PassThrough,
-                                                                              ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp
deleted file mode 100644
index 784a118897..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                GNHWC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                GNHWK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_i8_instances<2,
-                                                                             GNHWC,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             GNHWK,
-                                                                             Empty_Tuple,
-                                                                             PassThrough,
-                                                                             ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp
deleted file mode 100644
index 8c621543a9..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_f16_instances<2,
-                                                                              NHWGC,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              NHWGK,
-                                                                              Empty_Tuple,
-                                                                              PassThrough,
-                                                                              ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp
deleted file mode 100644
index 5cb313b3ca..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_i8_instances<2,
-                                                                             NHWGC,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             NHWGK,
-                                                                             Empty_Tuple,
-                                                                             PassThrough,
-                                                                             ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp
new file mode 100644
index 0000000000..6cb4ca5652
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NGCHW,
+                                                               GKCYX,
+                                                               Empty_Tuple,
+                                                               NGKHW,
+                                                               ConvFwdDefault>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
similarity index 87%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
index 65e233ce08..7368587c93 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
@@ -1,18 +1,19 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -27,7 +28,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
         instances,
         device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
                                                         NGCHW,
-                                                        GKYXC,
+                                                        GKCYX,
                                                         Empty_Tuple,
                                                         NGKHW,
                                                         ConvFwdDefault>{});
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp
new file mode 100644
index 0000000000..7f0feb61d8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>&)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+#if 0 // TODO: Improve compilation time and enable these instances
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NGCHW,
+                                                                  GKCYX,
+                                                                  Empty_Tuple,
+                                                                  NGKHW,
+                                                                  ConvFwdDefault>{});
+#endif
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp
new file mode 100644
index 0000000000..f9ad6b8212
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NGCHW,
+                                                              GKCYX,
+                                                              Empty_Tuple,
+                                                              NGKHW,
+                                                              ConvFwdDefault>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp
similarity index 87%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp
index 36c8e3cb13..803de2de55 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp
@@ -1,18 +1,19 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -27,7 +28,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
         instances,
         device_grouped_conv_fwd_xdl_f16_comp_instances<2,
                                                        NGCHW,
-                                                       GKYXC,
+                                                       GKCYX,
                                                        Empty_Tuple,
                                                        NGKHW,
                                                        ConvFwdDefault>{});
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp
new file mode 100644
index 0000000000..da7949668a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>&)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+#if 0 // TODO: Improve compilation time and enable these instances
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NGCHW,
+                                                                 GKCYX,
+                                                                 Empty_Tuple,
+                                                                 NGKHW,
+                                                                 ConvFwdDefault>{});
+#endif
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp
similarity index 89%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp
index ba3e982e99..3a93c16138 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
         instances,
         device_grouped_conv_fwd_xdl_f32_comp_instances<2,
                                                        NGCHW,
-                                                       GKYXC,
+                                                       GKCYX,
                                                        Empty_Tuple,
                                                        NGKHW,
                                                        ConvFwdDefault>{});
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp
new file mode 100644
index 0000000000..f5df7278d0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwd1x1P0>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwd1x1S1P0>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp
index 9f06347350..db048679bd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -48,14 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(
                                                         Empty_Tuple,
                                                         NHWGK,
                                                         ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
-                                                                                   NHWGC,
-                                                                                   GKYXC,
-                                                                                   Empty_Tuple,
-                                                                                   NHWGK,
-                                                                                   ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp
new file mode 100644
index 0000000000..ee9507a80a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGK,
+                                                                  ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGK,
+                                                                  ConvFwd1x1P0>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGK,
+                                                                  ConvFwd1x1S1P0>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp
new file mode 100644
index 0000000000..132d3c8411
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              ConvFwd1x1P0>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              ConvFwd1x1S1P0>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp
index 9b1c7ef65e..a7deb969ba 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -48,14 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(
                                                        Empty_Tuple,
                                                        NHWGK,
                                                        ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_comp_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp
new file mode 100644
index 0000000000..d2732547fa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwd1x1P0>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NHWGC,
+                                                                 GKYXC,
+                                                                 Empty_Tuple,
+                                                                 NHWGK,
+                                                                 ConvFwd1x1S1P0>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
index 93e07e08fb..8a0caebc9f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
@@ -48,14 +48,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
                                                        Empty_Tuple,
                                                        NHWGK,
                                                        ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f32_comp_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
index 78c2257b9a..e45df1e107 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -49,13 +50,65 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances(
                                                         NHWGK,
                                                         ConvFwd1x1S1P0>{});
 
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_comp_instances<2,
-                                                                                   NHWGC,
-                                                                                   GKYXC,
-                                                                                   Empty_Tuple,
-                                                                                   NHWGK,
-                                                                                   ConvFwdOddC>{});
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_int8_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGK,
+                                                                  ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_int8_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGK,
+                                                                  ConvFwd1x1P0>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_int8_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC,
+                                                                  Empty_Tuple,
+                                                                  NHWGK,
+                                                                  ConvFwd1x1S1P0>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_int8_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_int8_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwd1x1P0>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_int8_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC,
+                                                               Empty_Tuple,
+                                                               NHWGK,
+                                                               ConvFwd1x1S1P0>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
index 2afbfdc386..078221f89f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
                                                                               Empty_Tuple,
                                                                               GNHWK,
                                                                               ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
-                                                                              GNHWC,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              GNHWK,
-                                                                              ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index 822ef51e00..3a481dd204 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                                              Empty_Tuple,
                                                                              GNHWK,
                                                                              ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                             GNHWC,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             GNHWK,
-                                                                             ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 79a1fb99a8..5add0f8add 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                                              Empty_Tuple,
                                                                              GNHWK,
                                                                              ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f32_instances<2,
-                                                                             GNHWC,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             GNHWK,
-                                                                             ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
new file mode 100644
index 0000000000..6c5d9b5b94
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                                                              NGCHW,
+                                                                              GKCYX,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
new file mode 100644
index 0000000000..f1ccad2add
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NGCHW,
+                                                                             GKCYX,
+                                                                             Empty_Tuple,
+                                                                             NGKHW,
+                                                                             ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
new file mode 100644
index 0000000000..de7e416e48
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKCYX,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_instances<2,
+                                                                             NGCHW,
+                                                                             GKCYX,
+                                                                             Empty_Tuple,
+                                                                             NGKHW,
+                                                                             ConvFwdDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
index 6ee6aa1e4a..28283fb33d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
-                                                                              NGCHW,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              NGKHW,
-                                                                              ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_generic_instances<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NGKHW,
+                                                           ConvFwdDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
index e002058557..78d1747548 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                             NGCHW,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             NGKHW,
-                                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_generic_instances<2,
+                                                          NGCHW,
+                                                          GKYXC,
+                                                          Empty_Tuple,
+                                                          NGKHW,
+                                                          ConvFwdDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
index 1033db4972..5c8c3cb8a5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f32_instances<2,
-                                                                             NGCHW,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             NGKHW,
-                                                                             ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_generic_instances<2,
+                                                          NGCHW,
+                                                          GKYXC,
+                                                          Empty_Tuple,
+                                                          NGKHW,
+                                                          ConvFwdDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
index 65c75fa043..d89c29327c 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances(
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_instances<2,
-                                                                              NGCHW,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              NGKHW,
-                                                                              ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_int8_generic_instances<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NGKHW,
+                                                           ConvFwdDefault>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index e567c0df75..0257c7d315 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                                                                               Empty_Tuple,
                                                                               NHWGK,
                                                                               ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
-                                                                              NHWGC,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              NHWGK,
-                                                                              ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index 3e42184996..2715506fe2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                                              Empty_Tuple,
                                                                              NHWGK,
                                                                              ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                             NHWGC,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             NHWGK,
-                                                                             ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index c035d4c3da..8d3e4d91b1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
                                                                              Empty_Tuple,
                                                                              NHWGK,
                                                                              ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f32_instances<2,
-                                                                             NHWGC,
-                                                                             GKYXC,
-                                                                             Empty_Tuple,
-                                                                             NHWGK,
-                                                                             ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
index 5c425effd8..465fa927a5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
@@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances(
                                                                               Empty_Tuple,
                                                                               NHWGK,
                                                                               ConvFwd1x1S1P0>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_instances<2,
-                                                                              NHWGC,
-                                                                              GKYXC,
-                                                                              Empty_Tuple,
-                                                                              NHWGK,
-                                                                              ConvFwdOddC>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instance.cpp
similarity index 92%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instance.cpp
index 88b5f30da5..d57c67ba07 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -26,7 +26,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance
     add_device_operation_instances(instances,
                                    device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
                                                                                   NGCHW,
-                                                                                  GKYXC,
+                                                                                  GKCYX,
                                                                                   Empty_Tuple,
                                                                                   NGKHW,
                                                                                   ConvFwdDefault,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
similarity index 92%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
index 48cca9c3f3..484c056bc6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -26,7 +26,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance
     add_device_operation_instances(instances,
                                    device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
                                                                                   NGCHW,
-                                                                                  GKYXC,
+                                                                                  GKCYX,
                                                                                   Empty_Tuple,
                                                                                   NGKHW,
                                                                                   ConvFwdDefault,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instance.cpp
similarity index 92%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instance.cpp
index 3c467f380e..4d72623313 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -26,7 +26,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_fwd_xdl_f16_mem_instances<2,
                                                                                  NGCHW,
-                                                                                 GKYXC,
+                                                                                 GKCYX,
                                                                                  Empty_Tuple,
                                                                                  NGKHW,
                                                                                  ConvFwdDefault,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instance.cpp
similarity index 92%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instance.cpp
index 49691e862e..20ca169bc0 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -26,7 +26,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_fwd_xdl_f16_mem_instances<2,
                                                                                  NGCHW,
-                                                                                 GKYXC,
+                                                                                 GKCYX,
                                                                                  Empty_Tuple,
                                                                                  NGKHW,
                                                                                  ConvFwdDefault,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instance.cpp
similarity index 92%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instance.cpp
index c54a72a5c5..c1ef3f1012 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -26,7 +26,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_fwd_xdl_f32_mem_instances<2,
                                                                                  NGCHW,
-                                                                                 GKYXC,
+                                                                                 GKCYX,
                                                                                  Empty_Tuple,
                                                                                  NGKHW,
                                                                                  ConvFwdDefault,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instance.cpp
similarity index 92%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instance.cpp
index ed939e0be1..3ecd793cce 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances(
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -26,7 +26,7 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_fwd_xdl_f32_mem_instances<2,
                                                                                  NGCHW,
-                                                                                 GKYXC,
+                                                                                 GKCYX,
                                                                                  Empty_Tuple,
                                                                                  NGKHW,
                                                                                  ConvFwdDefault,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
deleted file mode 100644
index 9f0f9371bf..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NGCHW,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NGKHW,
-                                                                                  ConvFwdDefault,
-                                                                                  Interwave>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
deleted file mode 100644
index 217f57d879..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NGCHW,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NGKHW,
-                                                                                  ConvFwdDefault,
-                                                                                  Intrawave>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
index e8a763c527..87423801cb 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance
                                                                                   NHWGK,
                                                                                   ConvFwd1x1S1P0,
                                                                                   Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC,
-                                                                                  Interwave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
index 3ae3fb5186..ebb213461a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance
                                                                                   NHWGK,
                                                                                   ConvFwd1x1S1P0,
                                                                                   Intrawave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC,
-                                                                                  Intrawave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
index cb7e912936..c2c8a099b2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances
                                                                                  NHWGK,
                                                                                  ConvFwd1x1S1P0,
                                                                                  Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
-                                                                                 NHWGC,
-                                                                                 GKYXC,
-                                                                                 Empty_Tuple,
-                                                                                 NHWGK,
-                                                                                 ConvFwdOddC,
-                                                                                 Interwave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
index d787f4b048..11cb853f0d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances
                                                                                  NHWGK,
                                                                                  ConvFwd1x1S1P0,
                                                                                  Intrawave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
-                                                                                 NHWGC,
-                                                                                 GKYXC,
-                                                                                 Empty_Tuple,
-                                                                                 NHWGK,
-                                                                                 ConvFwdOddC,
-                                                                                 Intrawave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
index 5644289790..1992d7f7c1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances
                                                                                  NHWGK,
                                                                                  ConvFwd1x1S1P0,
                                                                                  Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
-                                                                                 NHWGC,
-                                                                                 GKYXC,
-                                                                                 Empty_Tuple,
-                                                                                 NHWGK,
-                                                                                 ConvFwdOddC,
-                                                                                 Interwave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
index 5b12dad5a3..2b8fd3d9db 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances
                                                                                  NHWGK,
                                                                                  ConvFwd1x1S1P0,
                                                                                  Intrawave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
-                                                                                 NHWGC,
-                                                                                 GKYXC,
-                                                                                 Empty_Tuple,
-                                                                                 NHWGK,
-                                                                                 ConvFwdOddC,
-                                                                                 Intrawave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
index f667481fa4..5579ec62cc 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance
                                                                                   NHWGK,
                                                                                   ConvFwd1x1S1P0,
                                                                                   Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC,
-                                                                                  Interwave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
index 2ff2c7f51f..77f3df2c11 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
@@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance
                                                                                   NHWGK,
                                                                                   ConvFwd1x1S1P0,
                                                                                   Intrawave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC,
-                                                                                  Intrawave>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instance.cpp
similarity index 93%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instance.cpp
index 14f00d8e88..a8ebcaa6b4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 BF16,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_inst
         instances,
         device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
                                                                  NGCHW,
-                                                                 GKYXC,
+                                                                 GKCYX,
                                                                  Empty_Tuple,
                                                                  NGKHW,
                                                                  ConvFwdDefault>{});
@@ -36,7 +36,7 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_inst
         instances,
         device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
                                                                  NGCHW,
-                                                                 GKYXC,
+                                                                 GKCYX,
                                                                  Empty_Tuple,
                                                                  NGKHW,
                                                                  ConvFwd3x3>{});
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instance.cpp
similarity index 93%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instance.cpp
index 3ae1ba3d05..5571e11aa0 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F16,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_insta
         instances,
         device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 ConvFwdDefault>{});
@@ -36,7 +36,7 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_insta
         instances,
         device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 ConvFwd3x3>{});
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instance.cpp
similarity index 93%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instance.cpp
index cc570568f3..252b09a1c4 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
@@ -9,10 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instances(
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkcyx_ngkhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 F32,
@@ -27,7 +27,7 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_insta
         instances,
         device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 ConvFwdDefault>{});
@@ -36,7 +36,7 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_insta
         instances,
         device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
                                                                 NGCHW,
-                                                                GKYXC,
+                                                                GKCYX,
                                                                 Empty_Tuple,
                                                                 NGKHW,
                                                                 ConvFwd3x3>{});
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index 6fa4bc6e46..5e78cf41d8 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -3,6 +3,7 @@
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -23,23 +24,46 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_inst
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
-                                                                 NHWGC,
-                                                                 GKYXC,
-                                                                 Empty_Tuple,
-                                                                 NHWGK,
-                                                                 ConvFwdDefault>{});
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2,
+                                                                        NHWGC,
+                                                                        GKYXC,
+                                                                        Empty_Tuple,
+                                                                        NHWGK,
+                                                                        ConvFwdDefault>{});
 
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
-                                                                 NHWGC,
-                                                                 GKYXC,
-                                                                 Empty_Tuple,
-                                                                 NHWGK,
-                                                                 ConvFwd3x3>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2,
+                                                                        NHWGC,
+                                                                        GKYXC,
+                                                                        Empty_Tuple,
+                                                                        NHWGK,
+                                                                        ConvFwd3x3>{});
+    }
+    else
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Empty_Tuple,
+                                                                     NHWGK,
+                                                                     ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC,
+                                                                     Empty_Tuple,
+                                                                     NHWGK,
+                                                                     ConvFwd3x3>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index 9fa56f48c7..fb494acd93 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -3,6 +3,7 @@
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -23,23 +24,46 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_insta
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                ConvFwdDefault>{});
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2,
+                                                                       NHWGC,
+                                                                       GKYXC,
+                                                                       Empty_Tuple,
+                                                                       NHWGK,
+                                                                       ConvFwdDefault>{});
 
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                ConvFwd3x3>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2,
+                                                                       NHWGC,
+                                                                       GKYXC,
+                                                                       Empty_Tuple,
+                                                                       NHWGK,
+                                                                       ConvFwd3x3>{});
+    }
+    else
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    Empty_Tuple,
+                                                                    NHWGK,
+                                                                    ConvFwdDefault>{});
+
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    Empty_Tuple,
+                                                                    NHWGK,
+                                                                    ConvFwd3x3>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
index 29fa8fa3c5..4bb05e5000 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
@@ -6,6 +6,22 @@ set(GROUPED_CONV3D_BWD_DATA
    xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_16_16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp
+   xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp
+
    wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
    wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
index 8331ea1fda..41f0235063 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho,
+
 // wo, k]
 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
@@ -27,21 +27,21 @@ void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
-                                                        GNDHWK,
-                                                        GKZYXC,
-                                                        Empty_Tuple,
-                                                        GNDHWC,
-                                                        ConvBwdDataDefault>{});
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3,
+                                                              GNDHWK,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              GNDHWC,
+                                                              ConvBwdDataDefault>{});
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
-                                                        GNDHWK,
-                                                        GKZYXC,
-                                                        Empty_Tuple,
-                                                        GNDHWC,
-                                                        ConvBwdDataFilter1x1Stride1Pad0>{});
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3,
+                                                              GNDHWK,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              GNDHWC,
+                                                              ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
index 1885d49c81..03b8285631 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho,
+
 // wo, k]
 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
@@ -27,21 +27,21 @@ void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f16_instances<3,
-                                                       GNDHWK,
-                                                       GKZYXC,
-                                                       Empty_Tuple,
-                                                       GNDHWC,
-                                                       ConvBwdDataDefault>{});
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3,
+                                                             GNDHWK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             GNDHWC,
+                                                             ConvBwdDataDefault>{});
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f16_instances<3,
-                                                       GNDHWK,
-                                                       GKZYXC,
-                                                       Empty_Tuple,
-                                                       GNDHWC,
-                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3,
+                                                             GNDHWK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             GNDHWC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
index 77135fcc05..59526ba9bc 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,7 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho,
+
 // wo, k]
 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
@@ -27,21 +27,21 @@ void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f32_instances<3,
-                                                       GNDHWK,
-                                                       GKZYXC,
-                                                       Empty_Tuple,
-                                                       GNDHWC,
-                                                       ConvBwdDataDefault>{});
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3,
+                                                             GNDHWK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             GNDHWC,
+                                                             ConvBwdDataDefault>{});
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_data_xdl_f32_instances<3,
-                                                       GNDHWK,
-                                                       GKZYXC,
-                                                       Empty_Tuple,
-                                                       GNDHWC,
-                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3,
+                                                             GNDHWK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             GNDHWC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp
new file mode 100644
index 0000000000..3f90c8b907
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3,
+                                                              NDHWGK,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGC,
+                                                              ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3,
+                                                              NDHWGK,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGC,
+                                                              ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index 663d41fe0b..f9989dec13 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp
new file mode 100644
index 0000000000..5862b01852
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3,
+                                                             NDHWGK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             NDHWGC,
+                                                             ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3,
+                                                             NDHWGK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             NDHWGC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index ac0ab44ce3..071d34b94a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp
new file mode 100644
index 0000000000..11caf88756
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3,
+                                                             NDHWGK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             NDHWGC,
+                                                             ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3,
+                                                             NDHWGK,
+                                                             GKZYXC,
+                                                             Empty_Tuple,
+                                                             NDHWGC,
+                                                             ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index 50d5cce73d..77127bf7f9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp
index 3f191ab6bc..96873cd87f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16_16_instance.cpp
new file mode 100644
index 0000000000..fdbfa99f8d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16_16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3,
+                                                              NGKDHW,
+                                                              GKCZYX,
+                                                              Empty_Tuple,
+                                                              NGCDHW,
+                                                              ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
new file mode 100644
index 0000000000..943c5bab26
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
+                                                        NGKDHW,
+                                                        GKCZYX,
+                                                        Empty_Tuple,
+                                                        NGCDHW,
+                                                        ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp
new file mode 100644
index 0000000000..2a6b11fa1b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_transpose_xdl_bf16_instances<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_16_16_instance.cpp
new file mode 100644
index 0000000000..0cc6104e85
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_16_16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3,
+                                                             NGKDHW,
+                                                             GKCZYX,
+                                                             Empty_Tuple,
+                                                             NGCDHW,
+                                                             ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
new file mode 100644
index 0000000000..bada2507c2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_instances<3,
+                                                       NGKDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGCDHW,
+                                                       ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp
new file mode 100644
index 0000000000..a2eb07261b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_transpose_xdl_f16_instances<3,
+                                                                 NGKDHW,
+                                                                 GKCZYX,
+                                                                 Empty_Tuple,
+                                                                 NGCDHW,
+                                                                 ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_16_16_instance.cpp
new file mode 100644
index 0000000000..a4b92ae13a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_16_16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3,
+                                                             NGKDHW,
+                                                             GKCZYX,
+                                                             Empty_Tuple,
+                                                             NGCDHW,
+                                                             ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
new file mode 100644
index 0000000000..f1c6f53bf3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_instances<3,
+                                                       NGKDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGCDHW,
+                                                       ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp
new file mode 100644
index 0000000000..606fe3f360
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKCZYX,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_transpose_xdl_f32_instances<3,
+                                                                 NGKDHW,
+                                                                 GKCZYX,
+                                                                 Empty_Tuple,
+                                                                 NGCDHW,
+                                                                 ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
new file mode 100644
index 0000000000..cbaf934b8a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_generic_instances<3,
+                                                                NGKDHW,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NGCDHW,
+                                                                ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
new file mode 100644
index 0000000000..645ee77b05
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_generic_instances<3,
+                                                               NGKDHW,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NGCDHW,
+                                                               ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
new file mode 100644
index 0000000000..97633b618a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NGKDHW,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NGCDHW,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_generic_instances<3,
+                                                               NGKDHW,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NGCDHW,
+                                                               ConvBwdDataDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index c25c481c05..af9ec8365d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index f61083e791..71ccd4c5ea 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgk_gkzyxc_ndhwgc_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index 2e014ae760..a05c130287 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_bilinear/xdl/device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_bilinear_ndhwgk_gkzyxc_ndhwgc_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index af94c0ce9d..8b7bce48bc 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_scale_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_scale_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index cc8995320a..890679e7e7 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_scale_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_scale_ndhwgk_gkzyxc_ndhwgc_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index 5ed7962bbc..0a22063347 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data_scale/xdl/device_grouped_conv3d_bwd_data_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_scale_instance.hpp"
@@ -8,8 +8,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
-// g, k]
+
 void add_device_grouped_conv3d_bwd_data_xdl_scale_ndhwgk_gkzyxc_ndhwgc_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
                                                                   NDHWGK,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
index 860e08cafe..1b0d2dd0b2 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -1,43 +1,49 @@
  # XDL_DL_WMMA_KERNELS
 set(GROUPED_CONV3D_BWD_WEIGHT
-     xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp
-     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp
+     xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+     xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+     xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
+
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp
+     xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp
+
+     xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
+     xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
+     xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
+
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp
+     xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp
     )
 
 if(DL_KERNELS)
@@ -62,7 +68,7 @@ list(APPEND GROUPED_CONV3D_BWD_WEIGHT
 
 if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_BWD_WEIGHT
-      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
+      xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
 endif()
 
 add_instance_library(device_grouped_conv3d_bwd_weight_instance ${GROUPED_CONV3D_BWD_WEIGHT})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
index 63249a1c13..4c4589d128 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
index 7841ddad99..b6d8c7f635 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
index ba6285a380..5b295e728b 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
index a8fbefb5bd..125b324985 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
index e4baafc0be..beb937f185 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
index f9bc5b1349..5274ff74a0 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
index 679f30a3d9..767e091b94 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
similarity index 96%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index f1ea371819..53011b4972 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
index 6e7f22b7e5..8a1e0b2008 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index eba721c7b8..d23b8516ca 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
similarity index 97%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index 7dd289139c..4de221a885 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
similarity index 100%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp
new file mode 100644
index 0000000000..e7cfcf1e5f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances<
+            3,
+            NGCDHW,
+            GKCZYX,
+            NGKDHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp
similarity index 89%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp
index ac6cb82681..8d9c3c56ed 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            BF16,
                                                            BF16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf1
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
             3,
             NGCDHW,
-            GKZYXC,
+            GKCZYX,
             NGKDHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp
similarity index 89%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp
index 705f5e8cef..b8036bab91 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            BF16,
                                                            BF16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf1
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances<
             3,
             NGCDHW,
-            GKZYXC,
+            GKCZYX,
             NGKDHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp
new file mode 100644
index 0000000000..f22b0c74c0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           NGKDHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances<
+            3,
+            NGCDHW,
+            GKCZYX,
+            NGKDHW,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp
similarity index 89%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp
index 489fa81a7f..c8c6253362 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F16,
                                                            F16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances<
             3,
             NGCDHW,
-            GKZYXC,
+            GKCZYX,
             NGKDHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp
similarity index 89%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp
index f9eadcec5b..c69a182271 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances(
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F16,
                                                            F16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16
         device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances<
             3,
             NGCDHW,
-            GKZYXC,
+            GKCZYX,
             NGKDHW,
             ConvBwdWeightDefault,
             BlockGemmPipelineScheduler::Intrawave,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
similarity index 93%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
index fd558f70ca..20e516dbe1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            BF16,
                                                            BF16,
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
                                                                     NGCDHW,
-                                                                    GKZYXC,
+                                                                    GKCZYX,
                                                                     NGKDHW,
                                                                     ConvBwdWeightDefault,
                                                                     1,
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
                                                                     NGCDHW,
-                                                                    GKZYXC,
+                                                                    GKCZYX,
                                                                     NGKDHW,
                                                                     ConvBwdWeightDefault,
                                                                     4,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
similarity index 93%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
index 945e6fa563..90dc8f858f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F16,
                                                            F16,
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
                                                                    NGCDHW,
-                                                                   GKZYXC,
+                                                                   GKCZYX,
                                                                    NGKDHW,
                                                                    ConvBwdWeightDefault,
                                                                    1,
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
                                                                    NGCDHW,
-                                                                   GKZYXC,
+                                                                   GKCZYX,
                                                                    NGKDHW,
                                                                    ConvBwdWeightDefault,
                                                                    4,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
similarity index 93%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
index b7e6454062..7550dcf7fd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
@@ -10,10 +10,10 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
-                                                           GKZYXC,
+                                                           GKCZYX,
                                                            NGKDHW,
                                                            F32,
                                                            F32,
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
                                                                    NGCDHW,
-                                                                   GKZYXC,
+                                                                   GKCZYX,
                                                                    NGKDHW,
                                                                    ConvBwdWeightDefault,
                                                                    1,
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances
         instances,
         device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
                                                                    NGCDHW,
-                                                                   GKZYXC,
+                                                                   GKCZYX,
                                                                    NGKDHW,
                                                                    ConvBwdWeightDefault,
                                                                    4,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
index 16221eb3e7..35669e6752 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
similarity index 95%
rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
index 126e90f2ce..2e0aeabca1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
new file mode 100644
index 0000000000..b21e4fdef6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<3,
+                                                                           NGCDHW,
+                                                                           GKZYXC,
+                                                                           NGKDHW,
+                                                                           ConvBwdWeightDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
index 1e572f9ceb..eeea4aae6d 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -8,6 +8,9 @@ set(GROUPED_CONV3D_FWD
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
 
    xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -16,18 +19,34 @@ set(GROUPED_CONV3D_FWD
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
+   xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
 
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp
 
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
    
    xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp
 
    wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
    wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
@@ -41,10 +60,6 @@ set(GROUPED_CONV3D_FWD
    wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp
    wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp
    wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp
-   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp
-   wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp
-   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
-   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
 )
 
 if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp
deleted file mode 100644
index fa378af1ee..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho,
-// wo, k]
-void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                GNDHWC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                GNDHWK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_f16_instances<3,
-                                                                              GNDHWC,
-                                                                              GKZYXC,
-                                                                              Empty_Tuple,
-                                                                              GNDHWK,
-                                                                              Empty_Tuple,
-                                                                              PassThrough,
-                                                                              ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp
deleted file mode 100644
index d41416fd4a..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho,
-// wo, k]
-void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                GNDHWC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                GNDHWK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_i8_instances<3,
-                                                                             GNDHWC,
-                                                                             GKZYXC,
-                                                                             Empty_Tuple,
-                                                                             GNDHWK,
-                                                                             Empty_Tuple,
-                                                                             PassThrough,
-                                                                             ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
deleted file mode 100644
index 8a7bc26178..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
-// g, k]
-void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_f16_instances<3,
-                                                                              NDHWGC,
-                                                                              GKZYXC,
-                                                                              Empty_Tuple,
-                                                                              NDHWGK,
-                                                                              Empty_Tuple,
-                                                                              PassThrough,
-                                                                              ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
deleted file mode 100644
index 7649f86971..0000000000
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
-// g, k]
-void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_wmma_i8_instances<3,
-                                                                             NDHWGC,
-                                                                             GKZYXC,
-                                                                             Empty_Tuple,
-                                                                             NDHWGK,
-                                                                             Empty_Tuple,
-                                                                             PassThrough,
-                                                                             ConvFwdOddC>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
index efc4640603..a94f687ef8 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
@@ -3,6 +3,7 @@
 
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -46,6 +47,62 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
                                                         Empty_Tuple,
                                                         NDHWGK,
                                                         ConvFwd1x1S1P0>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGK,
+                                                                  ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGK,
+                                                                  ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGK,
+                                                                  ConvFwd1x1S1P0>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwd1x1S1P0>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
index 3f3cd4b7d2..0c63345e7f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
@@ -3,6 +3,7 @@
 
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -46,6 +47,62 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(
                                                        Empty_Tuple,
                                                        NDHWGK,
                                                        ConvFwd1x1S1P0>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC,
+                                                                 Empty_Tuple,
+                                                                 NDHWGK,
+                                                                 ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC,
+                                                                 Empty_Tuple,
+                                                                 NDHWGK,
+                                                                 ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC,
+                                                                 Empty_Tuple,
+                                                                 NDHWGK,
+                                                                 ConvFwd1x1S1P0>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp
new file mode 100644
index 0000000000..3e1a2dd48b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NGCDHW,
+                                                               GKCZYX,
+                                                               Empty_Tuple,
+                                                               NGKDHW,
+                                                               ConvFwdDefault>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
new file mode 100644
index 0000000000..43241454a5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NGCDHW,
+                                                        GKCZYX,
+                                                        Empty_Tuple,
+                                                        NGKDHW,
+                                                        ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                                                   NGCDHW,
+                                                                                   GKCZYX,
+                                                                                   Empty_Tuple,
+                                                                                   NGKDHW,
+                                                                                   ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NGCDHW,
+                                                        GKCZYX,
+                                                        Empty_Tuple,
+                                                        NGKDHW,
+                                                        ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp
new file mode 100644
index 0000000000..85a1c9137c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>&)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+#if 0 // TODO: Improve compilation time and enable these instances
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                 NGCDHW,
+                                                                 GKCZYX,
+                                                                 Empty_Tuple,
+                                                                 NGKDHW,
+                                                                 ConvFwdDefault>{});
+#endif
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp
new file mode 100644
index 0000000000..9b8bf4fa42
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NGCDHW,
+                                                              GKCZYX,
+                                                              Empty_Tuple,
+                                                              NGKDHW,
+                                                              ConvFwdDefault>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
new file mode 100644
index 0000000000..d02d9f6778
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp
new file mode 100644
index 0000000000..eaac75ee9e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>&)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+#if 0 // TODO: Improve compilation time and enable these instances
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NGCDHW,
+                                                                 GKCZYX,
+                                                                 Empty_Tuple,
+                                                                 NGKDHW,
+                                                                 ConvFwdDefault>{});
+#endif
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
new file mode 100644
index 0000000000..696ea7f34e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_comp_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_comp_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f32_comp_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
new file mode 100644
index 0000000000..060eebebc1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NGCDHW,
+                                                                              GKCZYX,
+                                                                              Empty_Tuple,
+                                                                              NGKDHW,
+                                                                              ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NGCDHW,
+                                                                              GKCZYX,
+                                                                              Empty_Tuple,
+                                                                              NGKDHW,
+                                                                              ConvFwd1x1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NGCDHW,
+                                                                              GKCZYX,
+                                                                              Empty_Tuple,
+                                                                              NGKDHW,
+                                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
new file mode 100644
index 0000000000..85b088f416
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
new file mode 100644
index 0000000000..2b3e596355
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
new file mode 100644
index 0000000000..fac3098341
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
new file mode 100644
index 0000000000..f3eccc7dc8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
new file mode 100644
index 0000000000..abea0bea81
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
new file mode 100644
index 0000000000..ba5d9fb1de
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp
new file mode 100644
index 0000000000..5a2c4a0d5b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
new file mode 100644
index 0000000000..701b8eb4a4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
similarity index 81%
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
index c66d48ed7a..71bde2faa5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
@@ -1,44 +1,43 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKYXC,
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
                                                                 Empty_Tuple,
-                                                                NGKHW,
-                                                                int8_t,
-                                                                int8_t,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
                                                                 Empty_Tuple,
-                                                                int8_t,
+                                                                BF16,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances)
 {
     add_device_operation_instances(
         instances,
-        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
-                                                                 NGCHW,
-                                                                 GKYXC,
+        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
+                                                                 NGCDHW,
+                                                                 GKCZYX,
                                                                  Empty_Tuple,
-                                                                 NGKHW,
+                                                                 NGKDHW,
                                                                  ConvFwdDefault>{});
-
     add_device_operation_instances(
         instances,
-        device_grouped_conv_fwd_xdl_merged_groups_int8_instances<2,
-                                                                 NGCHW,
-                                                                 GKYXC,
+        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
+                                                                 NGCDHW,
+                                                                 GKCZYX,
                                                                  Empty_Tuple,
-                                                                 NGKHW,
+                                                                 NGKDHW,
                                                                  ConvFwd3x3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
new file mode 100644
index 0000000000..2e71b76256
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
new file mode 100644
index 0000000000..8a53dea612
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F32,
+                                                                F32,
+                                                                Empty_Tuple,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                ConvFwd3x3>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
index 0bd53706be..20f19bd774 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
@@ -30,13 +30,11 @@ using Empty_Tuple = ck::Tuple<>;
 using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 using device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,    F8,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,    F8,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,    F8,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp
index baf04cf12e..839d3559f7 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp
@@ -39,19 +39,31 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
+// double rate mfma instances on gfx950
+template <typename DsLayout,
+          typename DsDataType,
+          typename CDEElementwiseOp,
+          GemmSpecialization GemmSpec = GemmMNKPadding>
+using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances_2x = std::tuple<
+    // clang-format off
+        //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
+        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    128,  16,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
+    // clang-format on
+    >;
+
 template <typename DsLayout,
           typename DsDataType,
           typename CDEElementwiseOp,
           GemmSpecialization GemmSpec = GemmMNKPadding>
 using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
-// clang-format off
+    // clang-format off
         //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
-#if defined(CK_USE_AMD_MFMA_GFX950)
-        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    128,  16,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -70,13 +82,11 @@ template <typename DsLayout,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave>
 using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances =
     std::tuple<
-// clang-format off
+        // clang-format off
         //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
-#if defined(CK_USE_AMD_MFMA_GFX950)
-#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,    64,    16,    16,   256,   8,   4,   16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 4>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    32,   256,   8,   4,   16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
index 6848774431..220cb8031f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_default_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -27,6 +28,17 @@ void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_d
                                                                                ck::Tuple<BF16>,
                                                                                Multiply,
                                                                                GemmDefault>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                ck::Tuple<Row>,
+                ck::Tuple<BF16>,
+                Multiply,
+                GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
index bb2ea76aa4..6604078013 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_kpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -27,6 +28,17 @@ void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_k
                                                                                ck::Tuple<BF16>,
                                                                                Multiply,
                                                                                GemmKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                ck::Tuple<Row>,
+                ck::Tuple<BF16>,
+                Multiply,
+                GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
index 7439433f8a..157b45d713 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -27,6 +28,17 @@ void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_m
                                                                                ck::Tuple<BF16>,
                                                                                Multiply,
                                                                                GemmMNKPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                ck::Tuple<Row>,
+                ck::Tuple<BF16>,
+                Multiply,
+                GemmMNKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
index b3afed0fd7..ca7e774ccd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -2,6 +2,7 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
+#include "ck/host_utility/device_prop.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -27,6 +28,17 @@ void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_comp_m
                                                                                ck::Tuple<BF16>,
                                                                                Multiply,
                                                                                GemmMNPadding>{});
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances_2x<
+                ck::Tuple<Row>,
+                ck::Tuple<BF16>,
+                Multiply,
+                GemmMNPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/profiler/include/profiler/profile_gemm_multiply_multiply_weight_preshuffle_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp
similarity index 99%
rename from profiler/include/profiler/profile_gemm_multiply_multiply_weight_preshuffle_impl.hpp
rename to profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp
index 177e652cc3..c76387e2b0 100644
--- a/profiler/include/profiler/profile_gemm_multiply_multiply_weight_preshuffle_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp
@@ -12,7 +12,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
-#include "ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_wp.hpp"
 
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index 2054ffbbb3..f7b1d5f1f8 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -9,7 +9,7 @@
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/gemm_universal.hpp"
diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
old mode 100644
new mode 100755
index 72194e8e61..e625fae808
--- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
@@ -11,6 +11,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp"
 
@@ -20,12 +21,14 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 
 namespace ck {
 namespace profiler {
 
 template <typename ADataType,
           typename BDataType,
+          typename ComputeDataType,
           typename AccDataType,
           typename CDataType,
           typename ALayout,
@@ -66,7 +69,9 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
     int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
     int rotating_count    = std::max(
@@ -103,6 +108,9 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
     DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
+    DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
+                                   c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
+
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
 
@@ -125,21 +133,22 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
     // Run reference GEMM
     if(do_verification)
     {
-        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                                BDataType,
-                                                                                CDataType,
-                                                                                AccDataType,
-                                                                                AElementOp,
-                                                                                BElementOp,
-                                                                                CElementOp>;
 
-        auto ref_gemm    = ReferenceGemmInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-
-        auto ref_argument = ref_gemm.MakeArgument(
+        // Use CPU validation
+        // Note: GPU validation is not supported for fp8 !!!
+        using ReferenceGemmInstanceCPU = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                   BDataType,
+                                                                                   CDataType,
+                                                                                   AccDataType,
+                                                                                   AElementOp,
+                                                                                   BElementOp,
+                                                                                   CElementOp,
+                                                                                   ComputeDataType>;
+        auto ref_gemm_cpu              = ReferenceGemmInstanceCPU{};
+        auto ref_invoker_cpu           = ref_gemm_cpu.MakeInvoker();
+        auto ref_argument_cpu          = ref_gemm_cpu.MakeArgument(
             a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
-
-        ref_invoker.Run(ref_argument);
+        ref_invoker_cpu.Run(ref_argument_cpu);
     }
 
     std::string best_op_name;
@@ -203,6 +212,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
                     {
                         c_device_buf.FromDevice(c_m_n_device_result.mData.data());
 
+                        // Always compare against CPU reference results computed earlier
                         pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 
                         if(do_log)
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
index 93d3430bba..4e0ced347d 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -34,7 +34,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        const ck::utils::conv::ConvParam& conv_param)
+                                        const ck::utils::conv::ConvParam& conv_param,
+                                        ck::index_t split_k = 1)
 {
     using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -88,6 +89,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     // reset input to zero
     in_device_buf.SetZero();
 
+    float max_accumulated_value = 0;
     if(do_verification)
     {
         auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
@@ -114,17 +116,24 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                                                   in_element_op);
 
         ref_invoker.Run(ref_argument);
+        max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
     }
 
     std::string best_op_name;
-    float best_avg_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
+    float best_avg_time      = 0;
+    float best_tflops        = 0;
+    float best_gb_per_sec    = 0;
+    ck::index_t best_split_k = 1;
 
     // profile device op instances
     bool pass = true;
 
-    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
+        // workspace_sz will be equal to 0 for other layout than NGCHW
+        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        DeviceMem workspace_dev(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             // re-init output to zero before profiling next kernel
@@ -145,7 +154,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
             float gb_per_sec = num_btype / 1.E6 / avg_time;
 
             std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+                      << gb_per_sec << " GB/s, " << op_name << ", SplitK " << split_k_for_run
+                      << std::endl;
 
             if(tflops > best_tflops)
             {
@@ -153,13 +163,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                 best_tflops     = tflops;
                 best_avg_time   = avg_time;
                 best_gb_per_sec = gb_per_sec;
+                best_split_k    = split_k_for_run;
             }
 
             if(do_verification)
             {
                 in_device_buf.FromDevice(in_device.mData.data());
 
-                pass = pass & ck::utils::check_err(in_device, in_host);
+                using ComputeType = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
+                                                       OutDataType,
+                                                       WeiDataType>;
+                using AccDataType =
+                    std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
+                const index_t num_accums = conv_param.K_;
+                // Calculate thresholds
+                auto rtol = ck::utils::get_relative_threshold<ComputeType, InDataType, AccDataType>(
+                    num_accums / split_k_for_run);
+                auto atol = ck::utils::get_absolute_threshold<ComputeType, InDataType, AccDataType>(
+                    max_accumulated_value / split_k_for_run, num_accums / split_k_for_run);
+                // Calculate error due to split_k accumulation
+                auto rtol_split_k =
+                    ck::utils::get_relative_threshold<InDataType, InDataType, InDataType>(
+                        split_k_for_run);
+                auto atol_split_k =
+                    ck::utils::get_absolute_threshold<InDataType, InDataType, InDataType>(
+                        max_accumulated_value, split_k_for_run);
+                // Use higher threshold
+                rtol = std::max(rtol, rtol_split_k);
+                atol = std::max(atol, atol_split_k);
+
+                pass = pass & ck::utils::check_err(
+                                  in_device, in_host, "Error: Incorrect results!", rtol, atol);
+                std::cout << "Relative error threshold: " << rtol
+                          << " Absolute error threshold: " << atol << std::endl;
 
                 if(do_log)
                 {
@@ -220,35 +256,47 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     copy(conv_param.input_left_pads_, input_left_pads);
     copy(conv_param.input_right_pads_, input_right_pads);
 
+    std::vector<ck::index_t> split_k_list = {1, 2, 4, 8, 16, 32, 64, 128};
+
+    if(split_k > 0)
+    {
+        split_k_list = {split_k};
+    }
+
     for(auto& op_ptr : op_ptrs)
     {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                        {},
-                                        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                        out_lengths,
-                                        out_strides,
-                                        wei_lengths,
-                                        wei_strides,
-                                        {},
-                                        {},
-                                        in_lengths,
-                                        in_strides,
-                                        conv_filter_strides,
-                                        conv_filter_dilations,
-                                        input_left_pads,
-                                        input_right_pads,
-                                        out_element_op,
-                                        wei_element_op,
-                                        in_element_op);
+        for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(
+                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                {},
+                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                out_lengths,
+                out_strides,
+                wei_lengths,
+                wei_strides,
+                {},
+                {},
+                in_lengths,
+                in_strides,
+                conv_filter_strides,
+                conv_filter_dilations,
+                input_left_pads,
+                input_right_pads,
+                out_element_op,
+                wei_element_op,
+                in_element_op,
+                split_k_list[split_k_id]);
 
-        run_impl(op_ptr, argument_ptr);
+            run_impl(op_ptr, argument_ptr, split_k_list[split_k_id]);
+        }
     }
 
     std::cout << "Best configuration parameters:"
               << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
-              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
+              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK "
+              << best_split_k << std::endl;
 
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp
index 09e03de99c..8fb20f0135 100644
--- a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp
@@ -6,6 +6,7 @@
 #include <iomanip>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index 367e94de11..fc2ba5a650 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include <iomanip>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
diff --git a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
index 94ee2a37e4..1b17f05760 100644
--- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
@@ -6,6 +6,7 @@
 #include <iomanip>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp"
diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
index 3a4ca24dda..cf3c3a6bae 100644
--- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
@@ -6,6 +6,7 @@
 #include <iomanip>
 
 #include "ck/ck.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp"
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 5ed28b9826..17c8c277eb 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -50,7 +50,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
   if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
     list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
-    list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply_weight_preshuffle.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply_wp.cpp)
     list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
   endif()
   list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
@@ -58,7 +58,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
-  list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp)
   list(APPEND PROFILER_SOURCES profile_batched_gemm_b_scale.cpp)
   list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
@@ -76,6 +75,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12
   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
   endif()
+  list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
   list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp)
   list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp)
   list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
@@ -140,11 +140,10 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
   if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
-    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_weight_preshuffle_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_wp_instance)
     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
   endif()
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
-  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_b_scale_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
@@ -170,6 +169,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11"
   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
   endif()
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
diff --git a/profiler/src/profile_gemm_ab_scale.cpp b/profiler/src/profile_gemm_ab_scale.cpp
index 56c8b5e7a1..3956038a30 100644
--- a/profiler/src/profile_gemm_ab_scale.cpp
+++ b/profiler/src/profile_gemm_ab_scale.cpp
@@ -32,6 +32,7 @@ enum struct GemmDataType
 enum struct ScaleBlockTile
 {
     Tile_128_128_128, // 0
+    Tile_1_128_128,   // 1
 };
 
 #define OP_NAME "gemm_ab_scale"
@@ -49,7 +50,8 @@ int profile_gemm_ab_scale(int argc, char* argv[])
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
-        printf("arg4: scale block tile (0: ScaleBlockM/N/K = [128, 128, 128];\n");
+        printf("arg4: scale block tile (0: ScaleBlockM/N/K = [128, 128, 128]; 1: ScaleBlockM/N/K = "
+               "[1, 128, 128];\n");
         printf("arg5: verification (0: no; 1: yes)\n");
         printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg7: print tensor value (0: no; 1: yes)\n");
@@ -155,7 +157,7 @@ int profile_gemm_ab_scale(int argc, char* argv[])
     };
 
     if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN &&
-       scale_block_tile == ScaleBlockTile::Tile_128_128_128)
+       scale_block_tile == ScaleBlockTile::Tile_1_128_128)
     {
         return profile(F8{},
                        F32{},
@@ -164,7 +166,7 @@ int profile_gemm_ab_scale(int argc, char* argv[])
                        F8{},
                        F32{},
                        BF16{},
-                       ck::Number<128>{},
+                       ck::Number<1>{},
                        ck::Number<128>{},
                        ck::Number<128>{},
                        Row{},
diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp
index ad2bb77544..42192b5985 100644
--- a/profiler/src/profile_gemm_multiply_multiply.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply.cpp
@@ -42,7 +42,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
                "f16->f8; 7: f8->bf16, "
-               "comp f8; 8: int8->bf16; 9: f8->f16, comp f8;)\n");
+               "comp f8; 8: int8->bf16; 9: int8->f16, 10. f8->f16;)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
diff --git a/profiler/src/profile_gemm_multiply_multiply_weight_preshuffle.cpp b/profiler/src/profile_gemm_multiply_multiply_wp.cpp
similarity index 98%
rename from profiler/src/profile_gemm_multiply_multiply_weight_preshuffle.cpp
rename to profiler/src/profile_gemm_multiply_multiply_wp.cpp
index ee3be398e5..ff6cffb5f2 100644
--- a/profiler/src/profile_gemm_multiply_multiply_weight_preshuffle.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply_wp.cpp
@@ -6,7 +6,7 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/profile_gemm_multiply_multiply_weight_preshuffle_impl.hpp"
+#include "profiler/profile_gemm_multiply_multiply_wp_impl.hpp"
 #include "profiler_operation_registry.hpp"
 
 enum struct GemmMatrixLayout
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index a22d983da5..7f2393a7e6 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <initializer_list>
@@ -103,8 +103,10 @@ int profile_gemm_universal(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
     using F8 = ck::f8_t;
+#endif
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using I4 = ck::pk_i4_t;
 #endif
 
@@ -201,7 +203,7 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
     }
-#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8)
     else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
@@ -210,6 +212,8 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
     }
+#endif
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp
old mode 100644
new mode 100755
index b0f66a0c73..4d1ab811ee
--- a/profiler/src/profile_gemm_universal_streamk.cpp
+++ b/profiler/src/profile_gemm_universal_streamk.cpp
@@ -26,6 +26,7 @@ enum struct GemmDataType
     F8_F16_F16,     // 4
     F16_F8_F16,     // 5
     F16_F16_F16_F8, // 6
+    F8_F8_BF16,     // 7
 };
 
 #define OP_NAME "gemm_universal_streamk"
@@ -37,7 +38,7 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
     {
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, "
-               "comp f8)\n");
+               "comp f8;  7: f8->bf16,)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -112,15 +113,17 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
 
     auto profile = [&](auto a_type,
                        auto b_type,
+                       auto comp_type,
                        auto acc_type,
                        auto c_type,
                        auto a_layout,
                        auto b_layout,
                        auto c_layout) {
-        using ADataType   = decltype(a_type);
-        using BDataType   = decltype(b_type);
-        using AccDataType = decltype(acc_type);
-        using CDataType   = decltype(c_type);
+        using ADataType       = decltype(a_type);
+        using BDataType       = decltype(b_type);
+        using ComputeDataType = decltype(comp_type);
+        using AccDataType     = decltype(acc_type);
+        using CDataType       = decltype(c_type);
 
         using ALayout = decltype(a_layout);
         using BLayout = decltype(b_layout);
@@ -132,6 +135,7 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
 
         bool pass = ck::profiler::profile_gemm_universal_streamk_impl<ADataType,
                                                                       BDataType,
+                                                                      ComputeDataType,
                                                                       AccDataType,
                                                                       CDataType,
                                                                       ALayout,
@@ -158,46 +162,56 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
 
     if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(F16{}, F16{}, F32{}, F32{}, F16{}, Row{}, Row{}, Row{});
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(F16{}, F16{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(F16{}, F8{}, F32{}, F32{}, F16{}, Row{}, Row{}, Row{});
     }
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(F16{}, F8{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
     else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(F8{}, F16{}, F32{}, F32{}, F16{}, Row{}, Row{}, Row{});
     }
     else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(F8{}, F16{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
 #endif
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+        return profile(BF16{}, BF16{}, F32{}, F32{}, BF16{}, Row{}, Row{}, Row{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+        return profile(BF16{}, BF16{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
+        return profile(BF16{}, BF16{}, F32{}, F32{}, BF16{}, Col{}, Row{}, Row{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{});
+        return profile(BF16{}, BF16{}, F32{}, F32{}, BF16{}, Col{}, Col{}, Row{});
     }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+    }
+#endif
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/profiler/src/profile_grouped_conv_bwd_data.cpp
index 55d199317a..5cdece499e 100644
--- a/profiler/src/profile_grouped_conv_bwd_data.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_data.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -15,6 +15,8 @@ enum struct ConvLayout
 {
     GNHWC_GKYXC_GNHWK, // 0
     NHWGC_GKYXC_NHWGK, // 1
+    NGCHW_GKYXC_NGKHW, // 2
+    NGCHW_GKCYX_NGKHW, // 3
 };
 
 enum struct ConvDataType
@@ -35,8 +37,10 @@ static void print_helper_msg()
         << "arg2: data type (0: Output fp32, Weight fp32, Input fp32\n"
         << "                 1: Output fp16, Weight fp16, Input fp16\n"
         << "                 2: Output bf16, Weight bf16, Input bf16\n"
-        << "arg3: tensor layout (0: Output[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Input[G, N, Ho, Wo, K]\n"
-        << "                     1: Output[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Input[N, Ho, Wo, G, K])\n"
+        << "arg3: tensor layout (0: Output[G, N, Ho, Wo, C], Weight[G, K, Y, X, C], Input[G, N, Hi, Wi, K]\n"
+        << "                     1: Output[N, Ho, Wo, G, C], Weight[G, K, Y, X, C], Input[N, Hi, Wi, G, K])\n"
+        << "                     2: Output[N, G, C, Ho, Wo], Weight[G, K, Y, X, C], Input[N, G, K, Hi, Wi])\n"
+        << "                     3: Output[N, G, C, Ho, Wo], Weight[G, K, C, Y, X], Input[N, G, K, Hi, Wi])\n"
         << "arg4: verification (0: no, 1: yes)\n"
         << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
         << "arg6: print tensor value (0: no; 1: yes)\n"
@@ -64,8 +68,8 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
     const bool time_kernel     = std::stoi(argv[7]);
     const int num_dim_spatial  = std::stoi(argv[8]);
 
-    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
-    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
+    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K
+    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
     {
         print_helper_msg();
         return 1;
@@ -73,6 +77,8 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
 
     const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
 
+    ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]);
+
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
@@ -106,7 +112,7 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
                                                                      OutDataType,
                                                                      WeiDataType,
                                                                      InDataType>(
-            do_verification, init_method, do_log, time_kernel, params);
+            do_verification, init_method, do_log, time_kernel, params, split_k);
 
         return pass ? 0 : 1;
     };
@@ -143,6 +149,36 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
                 return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
             }
         }
+        else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, BF16{}, BF16{}, BF16{});
+            }
+        }
+        else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, BF16{}, BF16{}, BF16{});
+            }
+        }
     }
     else if(num_dim_spatial == 3)
     {
@@ -176,6 +212,36 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
                 return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, BF16{}, BF16{}, BF16{});
             }
         }
+        else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, BF16{}, BF16{}, BF16{});
+            }
+        }
+        else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, BF16{}, BF16{}, BF16{});
+            }
+        }
     }
 
     std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp
index 4170ac65aa..1640b48ffd 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <initializer_list>
@@ -17,6 +17,7 @@ enum struct ConvLayout
     GNHWC_GKYXC_GNHWK, // 1
     NHWGC_GKYXC_NHWGK, // 2
     NGCHW_GKYXC_NGKHW, // 3
+    NGCHW_GKCYX_NGKHW, // 4
 };
 
 enum struct ConvDataType
@@ -49,6 +50,8 @@ static void print_helper_msg()
                  "Ho, Wo, G, K]\n"
               << "                     3: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, "
                  "G, K, Ho, Wo]\n"
+              << "                     4: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, "
+                 "G, K, Ho, Wo]\n"
               << "arg4: verification (0: no, 1: yes)\n"
               << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
               << "arg6: print tensor value (0: no; 1: yes)\n"
@@ -199,6 +202,21 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
             return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
         }
     }
+    else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKCYX_NGKHW)
+    {
+        if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
+        }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{});
+        }
+    }
     if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
     {
         if(data_type == ConvDataType::F32_F32_F32)
@@ -262,6 +280,22 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
                 I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
         }
     }
+    else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW)
+    {
+        if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
+        }
+        if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(
+                I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{});
+        }
+    }
 
     std::cout << "this data_type & layout is not implemented" << std::endl;
 
diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp
index 7faf573dbf..a7714b4c73 100644
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -16,6 +16,7 @@ enum struct ConvLayout
     GNHWC_GKYXC_GNHWK, // 0
     NHWGC_GKYXC_NHWGK, // 1
     NGCHW_GKYXC_NGKHW, // 2
+    NGCHW_GKCYX_NGKHW, // 3
 };
 
 enum struct ConvDataType
@@ -52,11 +53,13 @@ static void print_helper_msg()
         << "                 5: Input bf8, Weight bf8, Output fp8\n"
         << "                 6: Input fp8, Weight bf8, Output fp8\n"
         << "                 7: Input bf8, Weight fp8, Output fp8)\n"
-        << "arg3: indexing data type (0: 32-bit, 1: 64-bit)\n"
-        << "arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
-        << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
+        << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
+        << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n"
         << "                     2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, "
-            "G, K, Ho, Wo]\n"
+        "G, K, Ho, Wo]\n"
+        << "                     3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, "
+        "G, K, Ho, Wo])\n"
+        << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n"
         << "arg5: verification (0: no, 1: yes)\n"
         << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
         << "arg7: print tensor value (0: no; 1: yes)\n"
@@ -110,14 +113,20 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
     using GKYXC  = ck::tensor_layout::convolution::GKYXC;
     using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 
+    // using GKCX   = ck::tensor_layout::convolution::GKXC;
+    using GKCYX  = ck::tensor_layout::convolution::GKCYX;
+    using GKCZYX = ck::tensor_layout::convolution::GKCZYX;
+
     using GNWK   = ck::tensor_layout::convolution::GNWK;
     using GNHWK  = ck::tensor_layout::convolution::GNHWK;
     using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
 
     //
-    using NGCHW = ck::tensor_layout::convolution::NGCHW;
+    using NGCHW  = ck::tensor_layout::convolution::NGCHW;
+    using NGCDHW = ck::tensor_layout::convolution::NGCDHW;
 
-    using NGKHW = ck::tensor_layout::convolution::NGKHW;
+    using NGKHW  = ck::tensor_layout::convolution::NGKHW;
+    using NGKDHW = ck::tensor_layout::convolution::NGKDHW;
 
     //
     using NWGC   = ck::tensor_layout::convolution::NWGC;
@@ -302,6 +311,25 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
         {
             return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
+        else if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
+    }
+    else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKCYX_NGKHW)
+    {
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{});
+        }
+        else if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
+        }
+        else if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
     }
     else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
     {
@@ -340,6 +368,23 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
             return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, BF8{}, F8{});
         }
     }
+    // NGCDHW_GKCZYX_NGKDHW
+    else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW)
+    {
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{});
+        }
+        else if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
+        }
+        else if(data_type == ConvDataType::BF16_BF16_BF16)
+        {
+            return profile(
+                I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
+        }
+    }
 
     std::cout << "this data_type & layout is not implemented" << std::endl;
 
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index cdf407d6cd..0e57af7aef 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -10,7 +10,7 @@ if [ $# -ge 2 ] ; then
     shift 2
     REST_ARGS=$@
 else
-    GPU_TARGETS="gfx908;gfx90a;gfx940"
+    GPU_TARGETS="gfx908;gfx90a;gfx942"
     REST_ARGS=
 fi
 
diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh
index 5e3f7faac2..95b1bebca5 100755
--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -10,7 +10,7 @@ if [ $# -ge 2 ] ; then
     shift 2
     REST_ARGS=$@
 else
-    GPU_TARGETS="gfx908;gfx90a;gfx940"
+    GPU_TARGETS="gfx908;gfx90a;gfx942"
     REST_ARGS=
 fi
 
diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py
index 5bcaf1448d..2ddcbb67cd 100644
--- a/script/convert_miopen_driver_to_profiler.py
+++ b/script/convert_miopen_driver_to_profiler.py
@@ -19,7 +19,10 @@ def init_const_args(args):
 
 def run_ck_profiler_cmd(cmd):
     print("ckProfiler command:")
-    print(cmd)
+    cmd_concatenated_str = ""
+    for arg in cmd:
+        cmd_concatenated_str += arg + " "
+    print(cmd_concatenated_str)
     subprocess.run(cmd)
 
 
@@ -27,9 +30,10 @@ def parse_layouts(args):
     if args.in_layout == "NCW" or args.in_layout == "NCHW" or \
        args.in_layout == "NCDHW":
         if args.ck_profier_op == "grouped_conv_bwd_weight":
+            args.layout = 4
+        elif args.ck_profier_op == "grouped_conv_fwd" or \
+             args.ck_profier_op == "grouped_conv_bwd_data":
             args.layout = 3
-        elif args.ck_profier_op == "grouped_conv_fwd":
-            args.layout = 2
         else:
             print('Not supported layout for this op')
             exit(1)
@@ -122,6 +126,8 @@ def run_ck_grouped_conv_bwd_data(args):
     args.ck_profier_op = "grouped_conv_bwd_data"
     parse_data_type(args)
     parse_layouts(args)
+    # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128}
+    args.split_k_value = -1
 
     cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)]
     cmd += [str(args.data_type), str(args.layout)]
@@ -132,6 +138,7 @@ def run_ck_grouped_conv_bwd_data(args):
     cmd += [str(args.in_channels)]
     add_conv_params_to_cmd(args, cmd)
 
+    cmd += [str(args.split_k_value)]
     run_ck_profiler_cmd(cmd)
 
 
diff --git a/script/process_perf_data.py b/script/process_perf_data.py
index 0d56c9baa2..2dd54fa62d 100644
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -344,6 +344,30 @@ def main():
             for i in range(1, len(results)+1):
                 testlist.append("Test%i"%i)
             table_name="ck_tile_gemm_mem_pipeline_fp16_tflops"
+        if 'gemm_basic_bf16' in filename:
+            for i in range(1, len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_tile_gemm_basic_bf16_tflops"
+        if 'gemm_mem_pipeline_bf16' in filename:
+            for i in range(1, len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_tile_gemm_mem_pipeline_bf16_tflops"
+        if 'gemm_basic_fp8' in filename:
+            for i in range(1, len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_tile_gemm_basic_fp8_tflops"
+        if 'gemm_mem_pipeline_fp8' in filename:
+            for i in range(1, len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_tile_gemm_mem_pipeline_fp8_tflops"
+        if 'gemm_basic_bf8' in filename:
+            for i in range(1, len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_tile_gemm_basic_bf8_tflops"
+        if 'gemm_mem_pipeline_bf8' in filename:
+            for i in range(1, len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_tile_gemm_mem_pipeline_bf8_tflops"
 
         tflops_base = get_baseline(table_name,conn)
         store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine)
diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh
index 815cf41e2d..fc44064874 100755
--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -43,19 +43,12 @@ file=./perf_fmha_bwd_gfx90a.log
 if [ -e "$file" ]; then
     python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
 fi
-file=./perf_tile_gemm_basic_fp16_gfx942.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx942.log
-fi
-file=./perf_tile_gemm_basic_fp16_gfx90a.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx90a.log
-fi
-file=./perf_tile_gemm_mem_pipeline_fp16_gfx942.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx942.log
-fi
-file=./perf_tile_gemm_mem_pipeline_fp16_gfx90a.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx90a.log
-fi
+
+for gpu in "gfx90a" "gfx942"; do
+    for dtype in "fp16" "bf16" "fp8" "bf8"; do
+        file=./perf_tile_gemm_mem_pipeline_${dtype}_${gpu}.log
+        if [ -e "$file" ]; then
+            python3 process_perf_data.py perf_tile_gemm_mem_pipeline_${dtype}_${gpu}.log
+        fi
+    done
+done
diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh
index c5bc1b9a1a..420453cddc 100755
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -52,19 +52,12 @@ file=./perf_fmha_bwd_gfx90a.log
 if [ -e "$file" ]; then
     python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
 fi
-file=./perf_gemm_basic_gfx942.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_gemm_basic_gfx942.log
-fi
-file=./perf_gemm_basic_gfx90a.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_gemm_basic_gfx90a.log
-fi
-file=./perf_gemm_mem_pipeline_gfx942.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_gemm_mem_pipeline_gfx942.log
-fi
-file=./perf_gemm_mem_pipeline_gfx90a.log
-if [ -e "$file" ]; then
-    python3 process_perf_data.py perf_gemm_mem_pipeline_gfx90a.log
-fi
+
+for gpu in "gfx90a" "gfx942"; do
+    for dtype in "fp16" "bf16" "fp8" "bf8"; do
+        file=./perf_tile_gemm_mem_pipeline_${dtype}_${gpu}.log
+        if [ -e "$file" ]; then
+            python3 process_perf_data.py perf_tile_gemm_mem_pipeline_${dtype}_${gpu}.log
+        fi
+    done
+done
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
old mode 100644
new mode 100755
index 5de59ee5a3..72c51823be
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -15,6 +15,9 @@ set(REGRESSION_TESTS
     test_gemm_splitk
     test_batched_gemm
     test_gemm_universal
+    test_gemm_universal_streamk_fp16
+    test_gemm_universal_streamk_bf16
+    test_gemm_universal_streamk_fp8
     test_batched_gemm_softmax_gemm_fp16
     test_batched_gemm_softmax_gemm_permute_fp16
     test_batched_gemm_bias_softmax_gemm_permute_fp16
@@ -98,11 +101,11 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(ARGN MATCHES "_wmma")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         endif()
         set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
         add_executable(${TEST_NAME} ${ARGN})
@@ -194,13 +197,13 @@ function(add_gtest_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(ARGN MATCHES "_wmma")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
         elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         elseif(ARGN MATCHES "_mx") #only build mx example for gfx950
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
         endif()
         set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
         add_executable(${TEST_NAME} ${ARGN})
@@ -239,6 +242,7 @@ add_subdirectory(gemm_add)
 add_subdirectory(gemm_layernorm)
 add_subdirectory(gemm_split_k)
 add_subdirectory(gemm_universal)
+add_subdirectory(gemm_universal_streamk)
 add_subdirectory(gemm_reduce)
 add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
@@ -275,6 +279,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9
 endif()
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx950") 
     add_subdirectory(mx_mfma_op)
+    add_subdirectory(gemm_mx)
 endif()
 add_subdirectory(position_embedding)
 add_subdirectory(scatter_gather)
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp
index 8e0baede11..8136257a24 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp
@@ -102,7 +102,6 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Bench_BF16_Irregul
                                                    {256, 64, 160, 64, 1, 16},
                                                    {1024, 1024, 80, 80, 1, 16},
                                                    {1024, 64, 80, 64, 1, 16},
-                                                   {4096, 4096, 40, 40, 1, 16},
                                                    {4096, 64, 40, 64, 1, 16}};
     this->bench_   = true;
     this->verify_  = false;
@@ -118,10 +117,6 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Bench_BF16)
         {512, 512, 128, 128, 48, 16},
         {1024, 1024, 64, 64, 48, 16},
         {1024, 1024, 128, 128, 48, 16},
-        {2048, 2048, 64, 64, 48, 16},
-        {2048, 2048, 128, 128, 48, 16},
-        {4096, 4096, 64, 64, 48, 16},
-        {4096, 4096, 128, 128, 48, 16},
     };
     this->bench_  = true;
     this->verify_ = false;
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
index f261164d61..74338ba383 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
@@ -3,7 +3,7 @@
 TYPED_TEST(TestCkTileBatchedGemm, Basic)
 {
     constexpr int M = 256;
-    constexpr int N = 128;
-    constexpr int K = 128;
+    constexpr int N = 256;
+    constexpr int K = 512;
     this->Run(M, N, K);
 }
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
index 5d0929f0e4..0af3ef3b34 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -28,17 +28,9 @@ class TestCkTileBatchedGemm : public ::testing::Test
     void invoke_batched_gemm(const ck_tile::BatchedGemmHostArgs& args,
                              const ck_tile::stream_config& s)
     {
-        // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-        constexpr bool kPadM = false;
-        constexpr bool kPadN = false;
-        constexpr bool kPadK = false;
-
-        constexpr int kBlockPerCu = 1;
-
-        // This part comes from the Codegen
-        constexpr ck_tile::index_t M_Tile = 128;
-        constexpr ck_tile::index_t N_Tile = 128;
-        constexpr ck_tile::index_t K_Tile = 32;
+        constexpr ck_tile::index_t M_Tile = 256;
+        constexpr ck_tile::index_t N_Tile = 256;
+        constexpr ck_tile::index_t K_Tile = 64;
 
         constexpr ck_tile::index_t M_Warp = 2;
         constexpr ck_tile::index_t N_Warp = 2;
@@ -46,70 +38,144 @@ class TestCkTileBatchedGemm : public ::testing::Test
 
         constexpr ck_tile::index_t M_Warp_Tile = 32;
         constexpr ck_tile::index_t N_Warp_Tile = 32;
-        constexpr ck_tile::index_t K_Warp_Tile = 8;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
 
-        using CodegenGemmShape =
+        constexpr bool DoubleSmemBuffer = false;
+
+        constexpr bool kPadM = false;
+        constexpr bool kPadN = false;
+        constexpr bool kPadK = false;
+
+        constexpr bool TransposeC = false;
+
+        constexpr int kBlockPerCu                         = 1;
+        constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+        constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+        using GemmShape =
             ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                    ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
                                    ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+        using TilePartitioner = ck_tile::
+            GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
 
-        using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+        using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                     kPadN,
+                                                                     kPadK,
+                                                                     DoubleSmemBuffer,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     CLayout,
+                                                                     TransposeC>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
-        using CodegenGemmTraits =
-            ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
 
-        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<ADataType,
-                                                                    BDataType,
-                                                                    AccDataType,
-                                                                    CodegenGemmShape,
-                                                                    CodegenGemmTraits>;
+        const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
 
-        using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+        float ave_time{0};
 
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                             CDataType,
-                                             CLayout,
-                                             CodegenGemmPipeline::BlockSize,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             CodegenPipelineProblem::TransposeC>>;
-        using Kernel =
-            ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr auto scheduler      = ck_tile::GemmPipelineScheduler::Intrawave;
 
-        auto kargs = Kernel::MakeKernelArgs(args);
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
-        constexpr dim3 blocks = Kernel::BlockSize();
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 CLayout,
+                                                 GemmPipelineProblem::kBlockSize,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC>>;
+            using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
 
-        if(s.log_level_ > 0)
+            const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
+            constexpr dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << GemmPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            return ave_time;
+        };
+
+        if(has_hot_loop)
         {
-            std::cout << "Launching kernel with args:"
-                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
+            if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+            else
+            {
+                std::ostringstream err;
+                err << "For compute pipeline tail number should always be Full, but have \""
+                    << tail_num << "\" which is not supported! PrefetchStages: "
+                    << BaseGemmPipeline::PrefetchStages << "\n File: " << __FILE__ << ":"
+                    << __LINE__ << ", in function: " << __func__;
+                throw std::runtime_error(err.str());
+            }
+        }
+        else
+        {
+            std::ostringstream err;
+            err << "Num K loop must be larger than number of prefetech stages."
+                << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
+                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+            throw std::runtime_error(err.str());
         }
-
-        ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
     }
 
     public:
     void Run(const int M,
              const int N,
              const int K,
-             int StrideA            = 128,
-             int StrideB            = 128,
-             int StrideC            = 128,
-             const int BatchStrideA = 32768,
-             const int BatchStrideB = 16384,
-             const int BatchStrideC = 32768,
-             const int BatchCount   = 16)
+             int StrideA            = 512,
+             int StrideB            = 512,
+             int StrideC            = 256,
+             const int BatchStrideA = 131072,
+             const int BatchStrideB = 131072,
+             const int BatchStrideC = 65536,
+             const int BatchCount   = 8)
     {
         using namespace ck_tile::literals;
 
diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
index ecfbd4e55b..3e7296b1eb 100644
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -1,4 +1,28 @@
-# Currently ck_tile is only built on gfx9
-if(GPU_TARGETS MATCHES "gfx9")
-    add_gtest_executable(test_ck_tile_gemm_pipeline test_gemm_pipeline.cpp)
+# Currently ck_tile is only built on gfx94/gfx95
+set(EXAMPLE_GEMM_COMPILE_OPTIONS "")
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+set(EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS "")
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
+    -mllvm
+    -enable-noalias-to-md-conversion=0
+)
+
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+    add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp)
+    add_gtest_executable(test_ck_tile_gemm_pipeline_compv3 test_gemm_pipeline_compv3.cpp)
+    add_gtest_executable(test_ck_tile_gemm_pipeline_compv4 test_gemm_pipeline_compv4.cpp)
+
+    target_compile_options(test_ck_tile_gemm_pipeline_mem PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    target_compile_options(test_ck_tile_gemm_pipeline_compv3 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    target_compile_options(test_ck_tile_gemm_pipeline_compv4 PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
+else()
+    message("Skipping ck_tile_gemm tests for current target")
+endif()
 endif()
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
new file mode 100644
index 0000000000..8944e6865d
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
@@ -0,0 +1,16 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_util.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelineCompV3 : public TestCkTileGemmPipeline<T>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3
+
+TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3, KernelTypesCompV3);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
new file mode 100644
index 0000000000..22e77fac41
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
@@ -0,0 +1,16 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_util.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelineCompV4 : public TestCkTileGemmPipeline<T>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4
+
+TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4, KernelTypesCompV4);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline.cpp b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
similarity index 62%
rename from test/ck_tile/gemm/test_gemm_pipeline.cpp
rename to test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
index f0236b5d88..bd1502516b 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
@@ -10,6 +10,7 @@
 
 using F16       = ck_tile::half_t;
 using F32       = float;
+using F8        = ck_tile::fp8_t;
 using Row       = ck_tile::tensor_layout::gemm::RowMajor;
 using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
 using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
@@ -21,27 +22,41 @@ using CompV3    = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType:
 using CompV4    = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV4>;
 
 // clang-format off
-using KernelTypes = ::testing::Types<
-    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler, PipelineType
+using KernelTypesMem = ::testing::Types<
     std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
     std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
     std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
     std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>,
     std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>
+>;
+
+using KernelTypesCompV3 = ::testing::Types<
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,         CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         CompV3>,
     std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>
-    >;
+    std::tuple<    Col,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,        CompV3>
+>;
+
+using KernelTypesCompV4 = ::testing::Types<
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>
+>;
+
 // clang-format on
-
-TYPED_TEST_SUITE(TestCkTileGemmPipeline, KernelTypes);
-
-#include "test_gemm_pipeline_ut_cases.inc"
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
new file mode 100644
index 0000000000..a7f4e68386
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
@@ -0,0 +1,16 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_util.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelineMem : public TestCkTileGemmPipeline<T>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelineMem
+
+TYPED_TEST_SUITE(TestCkTileGemmPipelineMem, KernelTypesMem);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
index e53015a975..1f0683f8b8 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc
@@ -3,7 +3,10 @@
 
 #pragma once
 
-TYPED_TEST(TestCkTileGemmPipeline, SmallM)
+#ifndef TEST_GEMM_PIPELINE_UT_CASES_INC
+#define TEST_GEMM_PIPELINE_UT_CASES_INC
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 1024;
@@ -13,18 +16,25 @@ TYPED_TEST(TestCkTileGemmPipeline, SmallM)
     {
         if constexpr(std::is_same_v<typename TestFixture::ALayout,
                                     ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
             EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+        }
         else
+        {
             this->Run(M, N, K);
+        }
     }
 }
 
-TYPED_TEST(TestCkTileGemmPipeline, MidLargeM)
+TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N           = 1024;
     constexpr int K           = 320;
-    constexpr int VecLoadSize = 8;
+    constexpr int VecLoadSize = (std::is_same_v<typename TestFixture::ADataType, ck_tile::fp8_t> ||
+                                 std::is_same_v<typename TestFixture::ADataType, ck_tile::bf8_t>)
+                                    ? 16
+                                    : 8;
 
     for(int M : Ms)
     {
@@ -33,9 +43,13 @@ TYPED_TEST(TestCkTileGemmPipeline, MidLargeM)
         {
             // TODO: Can we anyhow deduce used vector load size?
             if(M % VecLoadSize == 0)
+            {
                 this->Run(M, N, K);
+            }
             else
+            {
                 EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
         }
         else
         {
@@ -44,7 +58,7 @@ TYPED_TEST(TestCkTileGemmPipeline, MidLargeM)
     }
 }
 
-TYPED_TEST(TestCkTileGemmPipeline, PaddK)
+TYPED_TEST(TEST_SUITE_NAME, PaddK)
 {
     std::vector<int> Ms{128};
     constexpr int N = 1024;
@@ -54,7 +68,7 @@ TYPED_TEST(TestCkTileGemmPipeline, PaddK)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmPipeline, Regular)
+TYPED_TEST(TEST_SUITE_NAME, Regular)
 {
     std::vector<int> Ms{512};
     constexpr int N = 1024;
@@ -64,7 +78,16 @@ TYPED_TEST(TestCkTileGemmPipeline, Regular)
         this->Run(M, N, K);
 }
 
-TYPED_TEST(TestCkTileGemmPipeline, NotSupportedArgument)
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, NotSupportedArgument)
 {
     constexpr int M = 512;
     constexpr int N = 1025;
@@ -76,3 +99,5 @@ TYPED_TEST(TestCkTileGemmPipeline, NotSupportedArgument)
 
     EXPECT_THROW((this->template Run<PadM, PadN, PadK>(M, N, K)), std::runtime_error);
 }
+
+#endif
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
index 155234cddc..1b997ddbce 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -11,6 +11,27 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
 enum struct GemmPipelineType
 {
     Mem,
@@ -63,7 +84,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
         // TODO: This should be parameterized in tests
         constexpr ck_tile::index_t M_Tile = 256;
         constexpr ck_tile::index_t N_Tile = 256;
-        constexpr ck_tile::index_t K_Tile = 32;
+        constexpr ck_tile::index_t K_Tile = (PipelineType == GemmPipelineType::CompV4) ? 32 : 64;
 
         constexpr ck_tile::index_t M_Warp = 2;
         constexpr ck_tile::index_t N_Warp = 2;
@@ -71,7 +92,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
 
         constexpr ck_tile::index_t M_Warp_Tile = 32;
         constexpr ck_tile::index_t N_Warp_Tile = 32;
-        constexpr ck_tile::index_t K_Warp_Tile = 8;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
 
         constexpr bool kPadM = PadM;
         constexpr bool kPadN = PadN;
@@ -134,7 +155,9 @@ class TestCkTileGemmPipeline : public ::testing::Test
                 typename GemmPipelineTypeSelector<PipelineType, UniversalGemmProblem>::pipeline;
 
             using GemmEpilogue = ck_tile::CShuffleEpilogue<
-                ck_tile::CShuffleEpilogueProblem<AccDataType,
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 AccDataType,
                                                  CDataType,
                                                  CLayout,
                                                  GemmPipeline::BlockSize,
@@ -418,7 +441,18 @@ class TestCkTileGemmPipeline : public ::testing::Test
         ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
             a_m_k, b_k_n, c_m_n_host_ref);
 
-        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref);
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
         EXPECT_TRUE(pass);
     }
 };
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc
index 68c4693bb3..9f6b66c92b 100644
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc
@@ -2,7 +2,7 @@
 
 TYPED_TEST(TestCkTileGroupedGemm, Basic)
 {
-    const int group_count = 16;
+    const int group_count = 8;
     std::vector<int> Ms;
     std::vector<int> Ns;
     std::vector<int> Ks;
@@ -13,8 +13,8 @@ TYPED_TEST(TestCkTileGroupedGemm, Basic)
     for(int i = 0; i < group_count; i++)
     {
         Ms.push_back(256 + 256 * i);
-        Ns.push_back(128 + 128 * i);
-        Ks.push_back(128 + 64 * i);
+        Ns.push_back(256 + 512 * i);
+        Ks.push_back(256 + 64 * i);
 
         stride_As.push_back(Ks[i]);
         stride_Bs.push_back(Ks[i]);
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
index 6b9bf0c6f7..b125d19762 100644
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
@@ -44,63 +44,10 @@ class TestCkTileGroupedGemm : public ::testing::Test
         static const ck_tile::index_t K_Warp_Tile = 8;
     };
 
-    using CodegenGemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<GroupedGemKernelParam::M_Tile,
-                                                 GroupedGemKernelParam::N_Tile,
-                                                 GroupedGemKernelParam::K_Tile>,
-                               ck_tile::sequence<GroupedGemKernelParam::M_Warp,
-                                                 GroupedGemKernelParam::N_Warp,
-                                                 GroupedGemKernelParam::K_Warp>,
-                               ck_tile::sequence<GroupedGemKernelParam::M_Warp_Tile,
-                                                 GroupedGemKernelParam::N_Warp_Tile,
-                                                 GroupedGemKernelParam::K_Warp_Tile>>;
-
-    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
-
-    template <typename ALayout, typename BLayout, typename CLayout>
-    using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM,
-                                                      GroupedGemKernelParam::kPadN,
-                                                      GroupedGemKernelParam::kPadK,
-                                                      ALayout,
-                                                      BLayout,
-                                                      CLayout>;
-
-    template <typename ALayout, typename BLayout, typename CLayout>
-    using CodegenPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType,
-                                     BDataType,
-                                     AccDataType,
-                                     CodegenGemmShape,
-                                     CodegenGemmTraits<ALayout, BLayout, CLayout>>;
-
-    template <typename ALayout, typename BLayout, typename CLayout>
-    using CodegenGemmPipeline =
-        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>;
-
-    template <typename ALayout, typename BLayout, typename CLayout>
-    using GemmEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-        AccDataType,
-        CDataType,
-        CLayout,
-        CodegenGemmPipeline<ALayout, BLayout, CLayout>::BlockSize,
-        TilePartitioner::MPerBlock,
-        TilePartitioner::NPerBlock,
-        GroupedGemKernelParam::M_Warp,
-        GroupedGemKernelParam::N_Warp,
-        GroupedGemKernelParam::M_Warp_Tile,
-        GroupedGemKernelParam::N_Warp_Tile,
-        GroupedGemKernelParam::K_Warp_Tile,
-        CodegenPipelineProblem<ALayout, BLayout, CLayout>::TransposeC>>;
-
-    template <typename ALayout, typename BLayout, typename CLayout>
-    using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
-                                              CodegenGemmPipeline<ALayout, BLayout, CLayout>,
-                                              GemmEpilogue<ALayout, BLayout, CLayout>>;
-
-    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
-    std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs)
+    using grouped_gemm_kargs = ck_tile::GemmHostArgs;
+    std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
     {
-        return Kernel<std::nullptr_t, std::nullptr_t, std::nullptr_t>::GetWorkSpaceSize(gemm_descs);
+        return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg);
     }
 
     template <typename ALayout, typename BLayout, typename CLayout>
@@ -108,35 +55,140 @@ class TestCkTileGroupedGemm : public ::testing::Test
                              const ck_tile::stream_config& s,
                              void* p_workspace_)
     {
-        using GroupedGemmKernel = Kernel<ALayout, BLayout, CLayout>;
+        constexpr bool DoubleSmemBuffer = false;
+        constexpr bool TransposeC       = false;
 
-        auto arguments = GroupedGemmKernel::MakeKargs(gemm_descs);
+        constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+        constexpr ck_tile::index_t TileParitionerM01      = 4;
 
-        const dim3 grids      = GroupedGemmKernel::GridSize(gemm_descs);
-        constexpr dim3 blocks = GroupedGemmKernel::BlockSize();
+        using GemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<GroupedGemKernelParam::M_Tile,
+                                                     GroupedGemKernelParam::N_Tile,
+                                                     GroupedGemKernelParam::K_Tile>,
+                                   ck_tile::sequence<GroupedGemKernelParam::M_Warp,
+                                                     GroupedGemKernelParam::N_Warp,
+                                                     GroupedGemKernelParam::K_Warp>,
+                                   ck_tile::sequence<GroupedGemKernelParam::M_Warp_Tile,
+                                                     GroupedGemKernelParam::N_Warp_Tile,
+                                                     GroupedGemKernelParam::K_Warp_Tile>>;
+        using TilePartitioner = ck_tile::
+            GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
 
-        ck_tile::hip_check_error(hipMemcpyWithStream(
-            p_workspace_,
-            arguments.data(),
-            arguments.size() * sizeof(typename GroupedGemmKernel::GemmTransKernelArg),
-            hipMemcpyHostToDevice,
-            s.stream_id_));
+        using Traits              = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM,
+                                               GroupedGemKernelParam::kPadN,
+                                               GroupedGemKernelParam::kPadK,
+                                               ALayout,
+                                               BLayout,
+                                               CLayout>;
+        using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GroupedGemKernelParam::kPadM,
+                                                                     GroupedGemKernelParam::kPadN,
+                                                                     GroupedGemKernelParam::kPadK,
+                                                                     DoubleSmemBuffer,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     CLayout,
+                                                                     TransposeC>;
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
-        if(s.log_level_ > 0)
+        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain = gemm_descs[0].k_batch * GroupedGemKernelParam::K_Tile;
+        const ck_tile::index_t K_split =
+            (gemm_descs[0].K + k_grain - 1) / k_grain * GroupedGemKernelParam::K_Tile;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        float ave_time{0};
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr auto scheduler      = ck_tile::GemmPipelineScheduler::Intrawave;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;
+
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>;
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 CLayout,
+                                                 GemmPipelineProblem::kBlockSize,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 GroupedGemKernelParam::M_Warp,
+                                                 GroupedGemKernelParam::N_Warp,
+                                                 GroupedGemKernelParam::M_Warp_Tile,
+                                                 GroupedGemKernelParam::N_Warp_Tile,
+                                                 GroupedGemKernelParam::K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC>>;
+            using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKargs(gemm_descs);
+
+            const dim3 grids      = Kernel::GridSize(gemm_descs);
+            constexpr dim3 blocks = Kernel::BlockSize();
+
+            ck_tile::hip_check_error(hipMemcpyWithStream(p_workspace_,
+                                                         kargs.data(),
+                                                         get_workspace_size(gemm_descs),
+                                                         hipMemcpyHostToDevice,
+                                                         s.stream_id_));
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel: " << Kernel::GetName() << " with args:"
+                          << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<blocks.x, GroupedGemKernelParam::kBlockPerCu>(
+                    Kernel{},
+                    grids,
+                    blocks,
+                    0,
+                    ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
+                    gemm_descs.size()));
+            return ave_time;
+        };
+
+        if(has_hot_loop)
         {
-            std::cout << "Launching kernel with args:"
-                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
+            if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+            else
+            {
+                std::ostringstream err;
+                err << "For compute pipeline tail number should always be Full, but have \""
+                    << tail_num << "\" which is not supported! PrefetchStages: "
+                    << BaseGemmPipeline::PrefetchStages << "\n File: " << __FILE__ << ":"
+                    << __LINE__ << ", in function: " << __func__;
+                throw std::runtime_error(err.str());
+            }
+        }
+        else
+        {
+            std::ostringstream err;
+            err << "Num K loop must be larger than number of prefetech stages."
+                << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
+                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+            throw std::runtime_error(err.str());
         }
-        ck_tile::launch_kernel(s,
-                               ck_tile::make_kernel<blocks.x, GroupedGemKernelParam::kBlockPerCu>(
-                                   GroupedGemmKernel{},
-                                   grids,
-                                   blocks,
-                                   0,
-                                   ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
-                                   gemm_descs.size()));
     }
 
     public:
@@ -241,12 +293,14 @@ class TestCkTileGroupedGemm : public ::testing::Test
             const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
             void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();
 
+            // TODO add support for kbatch > 1
+            static constexpr ck_tile::index_t k_batch = 1;
             gemm_descs.push_back(
-                {p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+                {p_a, p_b, p_c, k_batch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
         }
 
         ck_tile::DeviceMem gemm_workspace;
-        gemm_workspace.Realloc(GetWorkspaceSize(gemm_descs));
+        gemm_workspace.Realloc(get_workspace_size(gemm_descs));
 
         invoke_grouped_gemm<ALayout, BLayout, CLayout>(
             gemm_descs, ck_tile::stream_config{nullptr, false}, gemm_workspace.GetDeviceBuffer());
diff --git a/test/data_type/test_bf8_ocp.cpp b/test/data_type/test_bf8_ocp.cpp
index 9d4ee38b15..285e7e69fc 100644
--- a/test/data_type/test_bf8_ocp.cpp
+++ b/test/data_type/test_bf8_ocp.cpp
@@ -1,13 +1,19 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
+#include "ck/library/utility/device_memory.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"
 
 using ck::bf8_ocp_t;
+using ck::bf8x2_ocp_t;
+using ck::bhalf2_t;
+using ck::bhalf_t;
 using ck::f8_convert_rne;
 using ck::f8_convert_sr;
+using ck::float2_t;
+using ck::half2_t;
 using ck::half_t;
 using ck::type_convert;
 
@@ -266,3 +272,590 @@ TEST(BF8OCP, ConvertFP16Stochastic)
     const auto bf8_nan = f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
     ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
 }
+
+constexpr uint64_t test_size = 256 + 6;
+
+__host__ __device__ void
+test_fp32_bf8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed)
+{
+    if(p_completed == nullptr)
+    {
+        return;
+    }
+
+    uint64_t& i = *p_completed;
+    i           = 0;
+
+    if(p_test == nullptr)
+    {
+        return;
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto v          = type_convert<float>(bf8_ocp_t{bf8_uid});
+        p_test[i]       = v;
+        i++;
+        if(i >= N)
+        {
+            return;
+        }
+    }
+
+    /// Test vector conversion
+    // bf8x2 -> fp32x2
+    bf8x2_ocp_t bf8x2{bf8x2_ocp_t::data_v{0b10000100, 0b00000001}}; //-2^-14, 2^-16
+
+    float2_t f32x2 = type_convert<float2_t>(bf8x2);
+    p_test[i++]    = f32x2[0];
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = f32x2[1];
+    if(i >= N)
+    {
+        return;
+    }
+
+    // fp32x2 -> bf8x2
+    f32x2 = {-4.0f, 2.0f};
+    bf8x2 = f8_convert_rne<bf8x2_ocp_t>(f32x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<float>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<float>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+
+    bf8x2 = f8_convert_sr<bf8x2_ocp_t>(f32x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<float>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<float>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+}
+
+TEST(BF8OCP, HostFP32BF8Convert)
+{
+    std::vector<float> out(test_size, -1.0f);
+    uint64_t completed = 0;
+
+    test_fp32_bf8_type_convert(test_size, out.data(), &completed);
+
+    std::set<uint8_t> bf8_nan_ids;
+    bf8_nan_ids.insert(0b11111111);
+    bf8_nan_ids.insert(0b01111111);
+    bf8_nan_ids.insert(0b11111101);
+    bf8_nan_ids.insert(0b01111101);
+    bf8_nan_ids.insert(0b11111110);
+    bf8_nan_ids.insert(0b01111110);
+    for(auto bf8_nan_id : bf8_nan_ids)
+    {
+        auto idx = bf8_nan_id;
+        ASSERT_TRUE(std::isnan(out[idx]));
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end())
+            continue;
+
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto idx        = bf8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<float>(bf8_ocp_t{bf8_uid}))
+            << " bf8_id: " << bf8_id << std::endl
+            << type_convert<float>(bf8_ocp_t{bf8_uid});
+    }
+
+    // /// Test vector conversions
+
+    auto i = 256;
+
+    // bf8x2 -> fp32x2
+    EXPECT_EQ(out[i++], -powf(2.0f, -14.0f));
+    EXPECT_EQ(out[i++], powf(2.0f, -16.0f));
+
+    // fp32x2 -> bf8x2
+    // RNE
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+    // SR
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__global__ void device_test_fp32_bf8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed)
+{
+    test_fp32_bf8_type_convert(N, p_test, p_completed);
+}
+
+TEST(BF8OCP, DeviceFP32BF8Convert)
+{
+    std::vector<float> out(test_size, -1.0f);
+
+    DeviceMem device_out(test_size * sizeof(float));
+    DeviceMem device_completed(sizeof(uint64_t));
+
+    device_out.SetValue(-21.0f);
+    device_completed.SetValue(-21.0f);
+
+    device_test_fp32_bf8_type_convert<<<1, 1>>>(
+        test_size,
+        static_cast<float*>(device_out.GetDeviceBuffer()),
+        static_cast<uint64_t*>(device_completed.GetDeviceBuffer()));
+
+    uint64_t completed = 0;
+    device_completed.FromDevice(&completed);
+    device_out.FromDevice(out.data());
+
+    std::set<uint8_t> bf8_nan_ids;
+    bf8_nan_ids.insert(0b11111111);
+    bf8_nan_ids.insert(0b01111111);
+    bf8_nan_ids.insert(0b11111101);
+    bf8_nan_ids.insert(0b01111101);
+    bf8_nan_ids.insert(0b11111110);
+    bf8_nan_ids.insert(0b01111110);
+    for(auto bf8_nan_id : bf8_nan_ids)
+    {
+        auto idx = bf8_nan_id;
+        ASSERT_TRUE(std::isnan(out[idx])) << "idx: " << idx << " out[idx]: " << out[idx];
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end())
+            continue;
+
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto idx        = bf8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<float>(bf8_ocp_t{bf8_uid}))
+            << " bf8_id: " << bf8_id << std::endl
+            << type_convert<float>(bf8_ocp_t{bf8_uid});
+    }
+
+    /// Test vector conversions
+
+    auto i = 256;
+
+    // bf8x2 -> fp32x2
+    EXPECT_EQ(out[i++], -powf(2.0f, -14.0f));
+    EXPECT_EQ(out[i++], powf(2.0f, -16.0f));
+
+    // fp32x2 -> bf8x2
+    // RNE
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+    // SR
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__host__ __device__ void
+test_fp16_bf8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed)
+{
+    if(p_completed == nullptr)
+    {
+        return;
+    }
+
+    uint64_t& i = *p_completed;
+    i           = 0;
+
+    if(p_test == nullptr)
+    {
+        return;
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto v          = type_convert<half_t>(bf8_ocp_t{bf8_uid});
+        p_test[i]       = v;
+        i++;
+        if(i >= N)
+        {
+            return;
+        }
+    }
+
+    /// Test vector conversion
+    // bf8x2 -> fp16x2
+    bf8x2_ocp_t bf8x2{bf8x2_ocp_t::data_v{0b10000100, 0b00000001}}; //-2^-14, 2^-16
+
+    half2_t f16x2 = type_convert<half2_t>(bf8x2);
+    p_test[i++]   = f16x2[0];
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = f16x2[1];
+    if(i >= N)
+    {
+        return;
+    }
+
+    // fp16x2 -> bf8x2
+    f16x2 = {-4.0f, 2.0f};
+    bf8x2 = f8_convert_rne<bf8x2_ocp_t>(f16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<half_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<half_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+
+    bf8x2 = f8_convert_sr<bf8x2_ocp_t>(f16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<half_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<half_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+}
+
+TEST(BF8OCP, HostFP16BF8Convert)
+{
+    std::vector<half_t> out(test_size, -1.0f);
+    uint64_t completed = 0;
+
+    test_fp16_bf8_type_convert(test_size, out.data(), &completed);
+
+    std::set<uint8_t> bf8_nan_ids;
+    bf8_nan_ids.insert(0b11111111);
+    bf8_nan_ids.insert(0b01111111);
+    bf8_nan_ids.insert(0b11111101);
+    bf8_nan_ids.insert(0b01111101);
+    bf8_nan_ids.insert(0b11111110);
+    bf8_nan_ids.insert(0b01111110);
+    for(auto bf8_nan_id : bf8_nan_ids)
+    {
+        auto idx = bf8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])));
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end())
+            continue;
+
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto idx        = bf8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<half_t>(bf8_ocp_t{bf8_uid}))
+            << " bf8_id: " << bf8_id << std::endl
+            << type_convert<float>(type_convert<half_t>(bf8_ocp_t{bf8_uid}));
+    }
+
+    // /// Test vector conversions
+
+    auto i = 256;
+
+    // bf8x2 -> fp16x2
+    EXPECT_EQ(out[i++], type_convert<half_t>(-powf(2.0f, -14.0f)));
+    EXPECT_EQ(out[i++], type_convert<half_t>(powf(2.0f, -16.0f)));
+
+    // fp16x2 -> bf8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__global__ void device_test_fp16_bf8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed)
+{
+    test_fp16_bf8_type_convert(N, p_test, p_completed);
+}
+
+TEST(BF8OCP, DeviceFP16BF8Convert)
+{
+    std::vector<half_t> out(test_size, -1.0f);
+
+    DeviceMem device_out(test_size * sizeof(half_t));
+    DeviceMem device_completed(sizeof(uint64_t));
+
+    device_out.SetValue(-21.0f);
+    device_completed.SetValue(-21.0f);
+
+    device_test_fp16_bf8_type_convert<<<1, 1>>>(
+        test_size,
+        static_cast<half_t*>(device_out.GetDeviceBuffer()),
+        static_cast<uint64_t*>(device_completed.GetDeviceBuffer()));
+
+    uint64_t completed = 0;
+    device_completed.FromDevice(&completed);
+    device_out.FromDevice(out.data());
+
+    std::set<uint8_t> bf8_nan_ids;
+    bf8_nan_ids.insert(0b11111111);
+    bf8_nan_ids.insert(0b01111111);
+    bf8_nan_ids.insert(0b11111101);
+    bf8_nan_ids.insert(0b01111101);
+    bf8_nan_ids.insert(0b11111110);
+    bf8_nan_ids.insert(0b01111110);
+    for(auto bf8_nan_id : bf8_nan_ids)
+    {
+        auto idx = bf8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])))
+            << "idx: " << idx << " out[idx]: " << type_convert<float>(out[idx]);
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end())
+            continue;
+
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto idx        = bf8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<half_t>(bf8_ocp_t{bf8_uid}))
+            << " bf8_id: " << bf8_id << std::endl
+            << type_convert<float>(type_convert<half_t>(bf8_ocp_t{bf8_uid}));
+    }
+
+    /// Test vector conversions
+
+    auto i = 256;
+
+    // bf8x2 -> fp16x2
+    EXPECT_EQ(out[i++], type_convert<half_t>(-powf(2.0f, -14.0f)));
+    EXPECT_EQ(out[i++], type_convert<half_t>(powf(2.0f, -16.0f)));
+
+    // fp16x2 -> bf8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__host__ __device__ void
+test_bf16_bf8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed)
+{
+    if(p_completed == nullptr)
+    {
+        return;
+    }
+
+    uint64_t& i = *p_completed;
+    i           = 0;
+
+    if(p_test == nullptr)
+    {
+        return;
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto v          = type_convert<bhalf_t>(bf8_ocp_t{bf8_uid});
+        p_test[i]       = v;
+        i++;
+        if(i >= N)
+        {
+            return;
+        }
+    }
+
+    /// Test vector conversion
+    // bf8x2 -> bf16x2
+    bf8x2_ocp_t bf8x2{bf8x2_ocp_t::data_v{0b10000100, 0b00000001}}; //-2^-14, 2^-16
+
+    bhalf2_t bf16x2 = type_convert<bhalf2_t>(bf8x2);
+    p_test[i++]     = bf16x2[0];
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = bf16x2[1];
+    if(i >= N)
+    {
+        return;
+    }
+
+    // bf16x2 -> bf8x2
+    bf16x2 = {type_convert<bhalf_t>(-4.0f), type_convert<bhalf_t>(2.0f)};
+    bf8x2  = f8_convert_rne<bf8x2_ocp_t>(bf16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<bhalf_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<bhalf_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+
+    bf8x2 = f8_convert_sr<bf8x2_ocp_t>(bf16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<bhalf_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<bhalf_t>(bf8x2.AsType<bf8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+}
+
+TEST(BF8OCP, HostBF16BF8Convert)
+{
+    std::vector<bhalf_t> out(test_size, -1.0f);
+    uint64_t completed = 0;
+
+    test_bf16_bf8_type_convert(test_size, out.data(), &completed);
+
+    std::set<uint8_t> bf8_nan_ids;
+    bf8_nan_ids.insert(0b11111111);
+    bf8_nan_ids.insert(0b01111111);
+    bf8_nan_ids.insert(0b11111101);
+    bf8_nan_ids.insert(0b01111101);
+    bf8_nan_ids.insert(0b11111110);
+    bf8_nan_ids.insert(0b01111110);
+    for(auto bf8_nan_id : bf8_nan_ids)
+    {
+        auto idx = bf8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])));
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end())
+            continue;
+
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto idx        = bf8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<bhalf_t>(bf8_ocp_t{bf8_uid}))
+            << " bf8_id: " << bf8_id << std::endl
+            << type_convert<float>(type_convert<bhalf_t>(bf8_ocp_t{bf8_uid}));
+    }
+
+    // /// Test vector conversions
+
+    auto i = 256;
+
+    // bf8x2 -> bf16x2
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-powf(2.0f, -14.0f)));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(powf(2.0f, -16.0f)));
+
+    // bf16x2 -> bf8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__global__ void
+device_test_bf16_bf8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed)
+{
+    test_bf16_bf8_type_convert(N, p_test, p_completed);
+}
+
+TEST(BF8OCP, DeviceBF16BF8Convert)
+{
+    std::vector<bhalf_t> out(test_size, -1.0f);
+
+    DeviceMem device_out(test_size * sizeof(bhalf_t));
+    DeviceMem device_completed(sizeof(uint64_t));
+
+    device_out.SetValue(-21.0f);
+    device_completed.SetValue(-21.0f);
+
+    device_test_bf16_bf8_type_convert<<<1, 1>>>(
+        test_size,
+        static_cast<bhalf_t*>(device_out.GetDeviceBuffer()),
+        static_cast<uint64_t*>(device_completed.GetDeviceBuffer()));
+
+    uint64_t completed = 0;
+    device_completed.FromDevice(&completed);
+    device_out.FromDevice(out.data());
+
+    std::set<uint8_t> bf8_nan_ids;
+    bf8_nan_ids.insert(0b11111111);
+    bf8_nan_ids.insert(0b01111111);
+    bf8_nan_ids.insert(0b11111101);
+    bf8_nan_ids.insert(0b01111101);
+    bf8_nan_ids.insert(0b11111110);
+    bf8_nan_ids.insert(0b01111110);
+    for(auto bf8_nan_id : bf8_nan_ids)
+    {
+        auto idx = bf8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])))
+            << "idx: " << idx << " out[idx]: " << type_convert<float>(out[idx]);
+    }
+
+    for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++)
+    {
+        if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end())
+            continue;
+
+        uint8_t bf8_uid = static_cast<uint8_t>(bf8_id);
+        auto idx        = bf8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<bhalf_t>(bf8_ocp_t{bf8_uid}))
+            << " bf8_id: " << bf8_id << std::endl
+            << type_convert<float>(type_convert<bhalf_t>(bf8_ocp_t{bf8_uid}));
+    }
+
+    /// Test vector conversions
+
+    auto i = 256;
+
+    // bf8x2 -> bf16x2
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-powf(2.0f, -14.0f)));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(powf(2.0f, -16.0f)));
+
+    // bf16x2 -> bf8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
diff --git a/test/data_type/test_fp8_ocp.cpp b/test/data_type/test_fp8_ocp.cpp
index 944dd89930..bf562112c8 100644
--- a/test/data_type/test_fp8_ocp.cpp
+++ b/test/data_type/test_fp8_ocp.cpp
@@ -1,13 +1,19 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
+#include "ck/library/utility/device_memory.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"
 
+using ck::bhalf2_t;
+using ck::bhalf_t;
 using ck::f8_convert_rne;
 using ck::f8_convert_sr;
 using ck::f8_ocp_t;
+using ck::f8x2_ocp_t;
+using ck::float2_t;
+using ck::half2_t;
 using ck::half_t;
 using ck::type_convert;
 
@@ -248,3 +254,566 @@ TEST(FP8OCP, ConvertFP16Stochastic)
     auto f8_nan = f8_convert_sr<f8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
     ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data));
 }
+
+constexpr uint64_t test_size = 256 + 6;
+
+__host__ __device__ void
+test_fp32_fp8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed)
+{
+    if(p_completed == nullptr)
+    {
+        return;
+    }
+
+    uint64_t& i = *p_completed;
+    i           = 0;
+
+    if(p_test == nullptr)
+    {
+        return;
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto v          = type_convert<float>(f8_ocp_t{fp8_uid});
+        p_test[i]       = v;
+        i++;
+        if(i >= N)
+        {
+            return;
+        }
+    }
+
+    /// Test vector conversion
+    // fp8x2 -> fp32x2
+    f8x2_ocp_t fp8x2{f8x2_ocp_t::data_v{0b10001000, 0b00000001}}; //-2^-6, 2^-9
+
+    float2_t f32x2 = type_convert<float2_t>(fp8x2);
+    p_test[i++]    = f32x2[0];
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = f32x2[1];
+    if(i >= N)
+    {
+        return;
+    }
+
+    // fp32x2 -> fp8x2
+    f32x2 = {-4.0f, 2.0f};
+    fp8x2 = f8_convert_rne<f8x2_ocp_t>(f32x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<float>(fp8x2.AsType<f8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<float>(fp8x2.AsType<f8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+
+    fp8x2 = f8_convert_sr<f8x2_ocp_t>(f32x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<float>(fp8x2.AsType<f8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<float>(fp8x2.AsType<f8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+}
+
+TEST(FP8OCP, HostFP32FP8Convert)
+{
+    std::vector<float> out(test_size, -1.0f);
+    uint64_t completed = 0;
+
+    test_fp32_fp8_type_convert(test_size, out.data(), &completed);
+
+    std::set<uint8_t> fp8_nan_ids;
+    fp8_nan_ids.insert(0b11111111); //-NaN
+    fp8_nan_ids.insert(0b01111111); // +NaN
+    for(auto fp8_nan_id : fp8_nan_ids)
+    {
+        auto idx = fp8_nan_id;
+        ASSERT_TRUE(std::isnan(out[idx]));
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end())
+            continue;
+
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto idx        = fp8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<float>(f8_ocp_t{fp8_uid}))
+            << " fp8_id: " << fp8_id << std::endl
+            << type_convert<float>(f8_ocp_t{fp8_uid});
+    }
+
+    // /// Test vector conversions
+
+    auto i = 256;
+
+    // fp8x2 -> fp32x2
+    EXPECT_EQ(out[i++], -powf(2.0f, -6.0f));
+    EXPECT_EQ(out[i++], powf(2.0f, -9.0f));
+
+    // fp32x2 -> fp8x2
+    // RNE
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+    // SR
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__global__ void device_test_fp32_fp8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed)
+{
+    test_fp32_fp8_type_convert(N, p_test, p_completed);
+}
+
+TEST(FP8OCP, DeviceFP32FP8Convert)
+{
+    std::vector<float> out(test_size, -1.0f);
+
+    DeviceMem device_out(test_size * sizeof(float));
+    DeviceMem device_completed(sizeof(uint64_t));
+
+    device_out.SetValue(-21.0f);
+    device_completed.SetValue(-21.0f);
+
+    device_test_fp32_fp8_type_convert<<<1, 1>>>(
+        test_size,
+        static_cast<float*>(device_out.GetDeviceBuffer()),
+        static_cast<uint64_t*>(device_completed.GetDeviceBuffer()));
+
+    uint64_t completed = 0;
+    device_completed.FromDevice(&completed);
+    device_out.FromDevice(out.data());
+
+    std::set<uint8_t> fp8_nan_ids;
+    fp8_nan_ids.insert(0b11111111); //-NaN
+    fp8_nan_ids.insert(0b01111111); // +NaN
+    for(auto fp8_nan_id : fp8_nan_ids)
+    {
+        auto idx = fp8_nan_id;
+        ASSERT_TRUE(std::isnan(out[idx])) << "idx: " << idx << " out[idx]: " << out[idx];
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end())
+            continue;
+
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto idx        = fp8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<float>(f8_ocp_t{fp8_uid}))
+            << " fp8_id: " << fp8_id << std::endl
+            << type_convert<float>(f8_ocp_t{fp8_uid});
+    }
+
+    /// Test vector conversions
+
+    auto i = 256;
+
+    // fp8x2 -> fp32x2
+    EXPECT_EQ(out[i++], -powf(2.0f, -6.0f));
+    EXPECT_EQ(out[i++], powf(2.0f, -9.0f));
+
+    // fp32x2 -> fp8x2
+    // RNE
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+    // SR
+    EXPECT_EQ(out[i++], -4.0f);
+    EXPECT_EQ(out[i++], 2.0f);
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__host__ __device__ void
+test_fp16_fp8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed)
+{
+    if(p_completed == nullptr)
+    {
+        return;
+    }
+
+    uint64_t& i = *p_completed;
+    i           = 0;
+
+    if(p_test == nullptr)
+    {
+        return;
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto v          = type_convert<half_t>(f8_ocp_t{fp8_uid});
+        p_test[i]       = v;
+        i++;
+        if(i >= N)
+        {
+            return;
+        }
+    }
+
+    /// Test vector conversion
+    // fp8x2 -> fp16x2
+    f8x2_ocp_t fp8x2{f8x2_ocp_t::data_v{0b10001000, 0b00000001}}; //-2^-6, 2^-9
+
+    half2_t f16x2 = type_convert<half2_t>(fp8x2);
+    p_test[i++]   = f16x2[0];
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = f16x2[1];
+    if(i >= N)
+    {
+        return;
+    }
+
+    // fp16x2 -> fp8x2
+    f16x2 = {-4.0f, 2.0f};
+    fp8x2 = f8_convert_rne<f8x2_ocp_t>(f16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<half_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<half_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+
+    fp8x2 = f8_convert_sr<f8x2_ocp_t>(f16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<half_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<half_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+}
+
+TEST(FP8OCP, HostFP16FP8Convert)
+{
+    std::vector<half_t> out(test_size, -1.0f);
+    uint64_t completed = 0;
+
+    test_fp16_fp8_type_convert(test_size, out.data(), &completed);
+
+    std::set<uint8_t> fp8_nan_ids;
+    fp8_nan_ids.insert(0b11111111); //-NaN
+    fp8_nan_ids.insert(0b01111111); // +NaN
+    for(auto fp8_nan_id : fp8_nan_ids)
+    {
+        auto idx = fp8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])));
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end())
+            continue;
+
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto idx        = fp8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<half_t>(f8_ocp_t{fp8_uid}))
+            << " fp8_id: " << fp8_id << std::endl
+            << type_convert<float>(type_convert<half_t>(f8_ocp_t{fp8_uid}));
+    }
+
+    // /// Test vector conversions
+
+    auto i = 256;
+
+    // fp8x2 -> fp16x2
+    EXPECT_EQ(out[i++], type_convert<half_t>(-powf(2.0f, -6.0f)));
+    EXPECT_EQ(out[i++], type_convert<half_t>(powf(2.0f, -9.0f)));
+
+    // fp16x2 -> fp8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__global__ void device_test_fp16_fp8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed)
+{
+    test_fp16_fp8_type_convert(N, p_test, p_completed);
+}
+
+TEST(FP8OCP, DeviceFP16FP8Convert)
+{
+    std::vector<half_t> out(test_size, -1.0f);
+
+    DeviceMem device_out(test_size * sizeof(half_t));
+    DeviceMem device_completed(sizeof(uint64_t));
+
+    device_out.SetValue(-21.0f);
+    device_completed.SetValue(-21.0f);
+
+    device_test_fp16_fp8_type_convert<<<1, 1>>>(
+        test_size,
+        static_cast<half_t*>(device_out.GetDeviceBuffer()),
+        static_cast<uint64_t*>(device_completed.GetDeviceBuffer()));
+
+    uint64_t completed = 0;
+    device_completed.FromDevice(&completed);
+    device_out.FromDevice(out.data());
+
+    std::set<uint8_t> fp8_nan_ids;
+    fp8_nan_ids.insert(0b11111111); //-NaN
+    fp8_nan_ids.insert(0b01111111); // +NaN
+    for(auto fp8_nan_id : fp8_nan_ids)
+    {
+        auto idx = fp8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])))
+            << "idx: " << idx << " out[idx]: " << type_convert<float>(out[idx]);
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end())
+            continue;
+
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto idx        = fp8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<half_t>(f8_ocp_t{fp8_uid}))
+            << " fp8_id: " << fp8_id << std::endl
+            << type_convert<float>(type_convert<half_t>(f8_ocp_t{fp8_uid}));
+    }
+
+    /// Test vector conversions
+
+    auto i = 256;
+
+    // fp8x2 -> fp16x2
+    EXPECT_EQ(out[i++], type_convert<half_t>(-powf(2.0f, -6.0f)));
+    EXPECT_EQ(out[i++], type_convert<half_t>(powf(2.0f, -9.0f)));
+
+    // fp16x2 -> fp8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<half_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<half_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__host__ __device__ void
+test_bf16_fp8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed)
+{
+    if(p_completed == nullptr)
+    {
+        return;
+    }
+
+    uint64_t& i = *p_completed;
+    i           = 0;
+
+    if(p_test == nullptr)
+    {
+        return;
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto v          = type_convert<bhalf_t>(f8_ocp_t{fp8_uid});
+        p_test[i]       = v;
+        i++;
+        if(i >= N)
+        {
+            return;
+        }
+    }
+
+    /// Test vector conversion
+    // fp8x2 -> bf16x2
+    f8x2_ocp_t fp8x2{f8x2_ocp_t::data_v{0b10001000, 0b00000001}}; //-2^-6, 2^-9
+
+    bhalf2_t bf16x2 = type_convert<bhalf2_t>(fp8x2);
+    p_test[i++]     = bf16x2[0];
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = bf16x2[1];
+    if(i >= N)
+    {
+        return;
+    }
+
+    // bf16x2 -> fp8x2
+    bf16x2 = {type_convert<bhalf_t>(-4.0f), type_convert<bhalf_t>(2.0f)};
+    fp8x2  = f8_convert_rne<f8x2_ocp_t>(bf16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<bhalf_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<bhalf_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+
+    fp8x2 = f8_convert_sr<f8x2_ocp_t>(bf16x2); // expect {-4, 2}
+
+    p_test[i++] = type_convert<bhalf_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<0>{})); //-4f
+    if(i >= N)
+    {
+        return;
+    }
+    p_test[i++] = type_convert<bhalf_t>(fp8x2.AsType<f8_ocp_t>()(ck::Number<1>{})); // 2f
+    if(i >= N)
+    {
+        return;
+    }
+}
+
+TEST(FP8OCP, HostBF16FP8Convert)
+{
+    std::vector<bhalf_t> out(test_size, -1.0f);
+    uint64_t completed = 0;
+
+    test_bf16_fp8_type_convert(test_size, out.data(), &completed);
+
+    std::set<uint8_t> fp8_nan_ids;
+    fp8_nan_ids.insert(0b11111111); //-NaN
+    fp8_nan_ids.insert(0b01111111); // +NaN
+    for(auto fp8_nan_id : fp8_nan_ids)
+    {
+        auto idx = fp8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])));
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end())
+            continue;
+
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto idx        = fp8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<bhalf_t>(f8_ocp_t{fp8_uid}))
+            << " fp8_id: " << fp8_id << std::endl
+            << type_convert<float>(type_convert<bhalf_t>(f8_ocp_t{fp8_uid}));
+    }
+
+    // /// Test vector conversions
+
+    auto i = 256;
+
+    // fp8x2 -> bf16x2
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-powf(2.0f, -6.0f)));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(powf(2.0f, -9.0f)));
+
+    // bf16x2 -> fp8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
+
+__global__ void
+device_test_bf16_fp8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed)
+{
+    test_bf16_fp8_type_convert(N, p_test, p_completed);
+}
+
+TEST(FP8OCP, DeviceBF16FP8Convert)
+{
+    std::vector<bhalf_t> out(test_size, -1.0f);
+
+    DeviceMem device_out(test_size * sizeof(bhalf_t));
+    DeviceMem device_completed(sizeof(uint64_t));
+
+    device_out.SetValue(-21.0f);
+    device_completed.SetValue(-21.0f);
+
+    device_test_bf16_fp8_type_convert<<<1, 1>>>(
+        test_size,
+        static_cast<bhalf_t*>(device_out.GetDeviceBuffer()),
+        static_cast<uint64_t*>(device_completed.GetDeviceBuffer()));
+
+    uint64_t completed = 0;
+    device_completed.FromDevice(&completed);
+    device_out.FromDevice(out.data());
+
+    std::set<uint8_t> fp8_nan_ids;
+    fp8_nan_ids.insert(0b11111111); //-NaN
+    fp8_nan_ids.insert(0b01111111); // +NaN
+    for(auto fp8_nan_id : fp8_nan_ids)
+    {
+        auto idx = fp8_nan_id;
+        ASSERT_TRUE(std::isnan(type_convert<float>(out[idx])))
+            << "idx: " << idx << " out[idx]: " << type_convert<float>(out[idx]);
+    }
+
+    for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++)
+    {
+        if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end())
+            continue;
+
+        uint8_t fp8_uid = static_cast<uint8_t>(fp8_id);
+        auto idx        = fp8_uid;
+        ASSERT_FLOAT_EQ(out[idx], type_convert<bhalf_t>(f8_ocp_t{fp8_uid}))
+            << " fp8_id: " << fp8_id << std::endl
+            << type_convert<float>(type_convert<bhalf_t>(f8_ocp_t{fp8_uid}));
+    }
+
+    /// Test vector conversions
+
+    auto i = 256;
+
+    // fp8x2 -> bf16x2
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-powf(2.0f, -6.0f)));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(powf(2.0f, -9.0f)));
+
+    // bf16x2 -> fp8x2
+    // RNE
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+    // SR
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(-4.0f));
+    EXPECT_EQ(out[i++], type_convert<bhalf_t>(2.0f));
+
+    EXPECT_EQ(test_size, completed);
+    EXPECT_EQ(test_size, i);
+}
diff --git a/test/data_type/test_mx_fp4.cpp b/test/data_type/test_mx_fp4.cpp
index 449f6fc777..a4fe044bc5 100644
--- a/test/data_type/test_mx_fp4.cpp
+++ b/test/data_type/test_mx_fp4.cpp
@@ -240,6 +240,10 @@ TEST(MXFP4, HostScaledConvert)
     EXPECT_EQ(test_size, i);
 }
 
+<<<<<<< HEAD
+=======
+#if !CK_TEMP_DISABLE_FP4_TESTS
+>>>>>>> develop
 __global__ void test_mx_fp4_device_scaled_convert(uint64_t N, float* p_test, uint64_t* p_completed)
 {
     test_mx_fp4_scaled_convert(N, p_test, p_completed);
@@ -539,3 +543,7 @@ TEST(MXFP4, DeviceF4x32ToF32x32ScaledConvert)
     EXPECT_EQ(N, completed);
     EXPECT_EQ(N, i);
 }
+<<<<<<< HEAD
+=======
+#endif // CK_TEMP_DISABLE_FP4_TESTS
+>>>>>>> develop
diff --git a/test/gemm_mx/CMakeLists.txt b/test/gemm_mx/CMakeLists.txt
new file mode 100644
index 0000000000..71a0a98f2d
--- /dev/null
+++ b/test/gemm_mx/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_gtest_executable(test_gemm_mx test_gemm_mx.cpp)
+if(result EQUAL 0)
+   target_link_libraries(test_gemm_mx PRIVATE utility device_gemm_mx_instance)
+ endif()
diff --git a/test/gemm_mx/test_gemm_mx.cpp b/test/gemm_mx/test_gemm_mx.cpp
new file mode 100644
index 0000000000..6e1957e60a
--- /dev/null
+++ b/test/gemm_mx/test_gemm_mx.cpp
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "test_gemm_mx_util.hpp"
+
+using E8M0 = ck::e8m0_bexp_t;
+using F8   = ck::f8_t;
+using BF8  = ck::bf8_t;
+using F6   = ck::f6_t;
+using BF6  = ck::bf6_t;
+using F4   = ck::f4_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmMX_MK_NK
+    : public ck::test::TestGemmMX<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_NK = ::testing::Types<
+#if defined(CK_ENABLE_FP8)
+    //         ADataType, BDataType,       CDataType, ScaleBlockSize
+    std::tuple<       F8,        F8,             F16, ck::Number<32> >,
+    std::tuple<       F8,        F8,            BF16, ck::Number<32> >
+#endif
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmMX_MK_NK, KernelTypes_MK_NK);
+
+TYPED_TEST(TestGemmMX_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 256;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmMX_MK_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 256;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmMX_MK_NK, Regular)
+{
+    std::vector<int> Ms{3840};
+    constexpr int N = 512;
+    constexpr int K = 1024;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmMX_MK_NK, Large)
+{
+    std::vector<int> Ms{4096};
+    constexpr int N = 3840;
+    constexpr int K = 4096;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
diff --git a/test/gemm_mx/test_gemm_mx_util.hpp b/test/gemm_mx/test_gemm_mx_util.hpp
new file mode 100644
index 0000000000..3bca4ceded
--- /dev/null
+++ b/test/gemm_mx/test_gemm_mx_util.hpp
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/number.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_mx.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_mx.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+namespace ck {
+namespace test {
+
+namespace {
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+} // namespace
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          int ScaleBlockSize>
+bool profile_gemm_mx_impl(int do_verification,
+                          int init_method,
+                          bool do_log,
+                          bool time_kernel,
+                          int M,
+                          int N,
+                          int K,
+                          int StrideA,
+                          int StrideB,
+                          int StrideC,
+                          int KBatch,
+                          int n_warmup,
+                          int n_iter,
+                          uint64_t rotating = 0)
+{
+    if(K % ScaleBlockSize != 0)
+    {
+        throw std::runtime_error("wrong! K must be multiple of ScaleBlockSize.");
+    };
+
+    using ScaleDataType = e8m0_bexp_t;
+    using AScaleLayout  = Row;
+    using BScaleLayout  = Col;
+
+    bool pass = true;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+    auto f_get_default_stride =
+        [](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<ck::index_t>(col);
+                }
+                else
+                {
+                    return static_cast<ck::index_t>(row);
+                }
+            }
+            else
+                return static_cast<ck::index_t>(stride);
+        };
+
+    auto Scale_Stride_AM = f_get_default_stride(M, K / ScaleBlockSize, -1, AScaleLayout{});
+    auto Scale_Stride_BN = f_get_default_stride(K / ScaleBlockSize, N, -1, BScaleLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+
+    Tensor<ScaleDataType> a_m_k_scale(f_host_tensor_descriptor(
+        M, K / ScaleBlockSize, Scale_Stride_AM, AScaleLayout{})); // scales for A
+    Tensor<ScaleDataType> b_k_n_scale(f_host_tensor_descriptor(
+        K / ScaleBlockSize, N, Scale_Stride_BN, BScaleLayout{})); // scales for B
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::size_t total_gemm_needed =
+        a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes() +
+        a_m_k_scale.GetElementSpaceSizeInBytes() + b_k_n_scale.GetElementSpaceSizeInBytes();
+    int rotating_count = std::max(
+        1,
+        std::min(n_iter,
+                 static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "a_m_k_scale: " << a_m_k_scale.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "b_k_n_scale: " << b_k_n_scale.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    std::cout << "rotating count: " << rotating_count << std::endl;
+
+    switch(init_method)
+    {
+    case 0: // Initializations for development and debugging
+        ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.0f)}(a_m_k);
+        ck::utils::FillConstant<ScaleDataType>{ck::type_convert<ScaleDataType>(2.0f)}(a_m_k_scale);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(0.5f)}(b_k_n);
+        ck::utils::FillConstant<ScaleDataType>{ck::type_convert<ScaleDataType>(1.0f)}(b_k_n_scale);
+        if(do_log)
+        {
+            std::cout << "Init A = {1}" << std::endl;
+            std::cout << "Init A scale = {2.0}" << std::endl;
+            std::cout << "Init B = {0.5}" << std::endl;
+            std::cout << "Init B scale = {1.0}" << std::endl;
+            std::cout << "Expect C = {K}" << std::endl;
+        }
+        break;
+
+    case 1:
+
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-4, 5}); // Z[-4,4]
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-4, 5}); // Z[-4,4]
+
+        a_m_k_scale.GenerateTensorValue(
+            GeneratorTensor_2<ScaleDataType>{125, 129}); // scales: {0.25, 0.5, 1, 2}
+        b_k_n_scale.GenerateTensorValue(
+            GeneratorTensor_2<ScaleDataType>{125, 129}); // scales: {0.25, 0.5, 1, 2}
+
+        break;
+
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2.0, 2.0});
+        a_m_k_scale.GenerateTensorValue(
+            GeneratorTensor_3<ScaleDataType>{powf(2.0f, -125.0f), 1.0f}); // R[2^-125, 1]
+
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2.0, 2.0});
+        b_k_n_scale.GenerateTensorValue(
+            GeneratorTensor_3<ScaleDataType>{powf(2.0f, -125.0f), 1.0f});
+        break;
+    }
+
+    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    if(do_log > 0)
+        std::cout << "Device memory allocation..." << std::endl;
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem a_scale_device_buf(sizeof(ScaleDataType) * a_m_k_scale.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b_scale_device_buf(sizeof(ScaleDataType) * b_k_n_scale.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    if(do_log > 0)
+        std::cout << "Upload data to device..." << std::endl;
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    a_scale_device_buf.ToDevice(a_m_k_scale.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    b_scale_device_buf.ToDevice(b_k_n_scale.mData.data());
+
+    if(do_log > 0)
+        std::cout << "Done." << std::endl;
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMX<ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                ADataType,
+                                                                ScaleDataType,
+                                                                BDataType,
+                                                                ScaleDataType,
+                                                                CDataType,
+                                                                ScaleBlockSize,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Run reference GEMM
+    if(do_verification)
+    {
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceMXGemm<ADataType,
+                                                        BDataType,
+                                                        CDataType,
+                                                        float, // AccDataType
+                                                        ScaleDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CElementOp,
+                                                        float, // ComputeTypeA
+                                                        float  // ComputeTypeB
+                                                        >;
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(a_m_k,
+                                                  a_m_k_scale,
+                                                  b_k_n,
+                                                  b_k_n_scale,
+                                                  c_m_n_host_result,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  c_element_op);
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    std::string best_op_name;
+    std::optional<std::string> best_op_object_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    float best_kbatch     = 0;
+
+    // profile device GEMM instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38}; // use these when KBatch <= 0
+
+        if(KBatch > 0)
+        {
+            kbatch_list = {KBatch};
+        }
+
+        for(std::size_t i = 0; i < kbatch_list.size(); i++)
+        {
+            auto kbatch_curr = kbatch_list[i];
+
+            auto argument_ptr = op_ptr->MakeArgumentPointer(
+                static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                static_cast<ScaleDataType*>(a_scale_device_buf.GetDeviceBuffer()),
+                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                static_cast<ScaleDataType*>(b_scale_device_buf.GetDeviceBuffer()),
+                static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                M,
+                N,
+                K,
+                StrideA,
+                Scale_Stride_AM,
+                StrideB,
+                Scale_Stride_BN,
+                StrideC,
+                kbatch_curr,
+                a_element_op,
+                b_element_op,
+                c_element_op);
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+
+                // re-init C to zero before profiling next kernel
+                c_device_buf.SetZero();
+
+                invoker_ptr->Run(argument_ptr.get(),
+                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});
+
+                if(do_verification)
+                {
+                    c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+                    if(do_log)
+                    {
+
+                        if(init_method == 0)
+                        {
+                            auto expected = static_cast<float>(K);
+                            auto computed = type_convert<float>(c_m_n_device_result(0, 12));
+
+                            pass = pass & (std::abs(expected - computed) <= 0.0f);
+                            std::cout << "\nExpected vs Computed: " << expected << " vs "
+                                      << computed << ((pass) ? " (PASSED!)" : " (FAILED!)")
+                                      << std::endl
+                                      << std::endl;
+                        }
+                        else
+                        {
+                            LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",")
+                                << std::endl;
+                            LogRangeAsType<float>(std::cout << "a_scale : ", a_m_k_scale.mData, ",")
+                                << std::endl;
+                            LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",")
+                                << std::endl;
+                            LogRangeAsType<float>(std::cout << "b_scale: ", b_k_n_scale.mData, ",")
+                                << std::endl;
+                            LogRangeAsType<float>(
+                                std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                                << std::endl;
+                            LogRangeAsType<float>(
+                                std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                                << std::endl;
+                        }
+                    }
+
+                    pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+                }
+
+                std::string op_name                    = op_ptr->GetTypeString();
+                std::optional<std::string> op_obj_name = op_ptr->GetObjectName();
+
+                float ave_time = invoker_ptr->Run(argument_ptr.get(),
+                                                  StreamConfig{nullptr,
+                                                               time_kernel,
+                                                               0,
+                                                               n_warmup,
+                                                               n_iter,
+                                                               rotating_count > 1,
+                                                               rotating_count});
+
+                // Output size(M*N) * [dot product(2K) + product of scales(K/ScaleBlockSize) +
+                // scaling of partial sums(K/ScaleBlockSize)]
+                // FLOPS = 2 * M * N * K + 2 * M * N * K / ScaleBlockSize
+                std::size_t flop =
+                    std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / ScaleBlockSize;
+
+                std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                        sizeof(CDataType) * M * N +
+                                        sizeof(ScaleDataType) * (M * K + K * N) / ScaleBlockSize;
+
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+                float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
+                          << kbatch_curr << std::endl;
+
+                if(tflops > best_tflops && ave_time > 1e-10)
+                {
+                    best_op_name        = op_name;
+                    best_op_object_name = op_obj_name;
+                    best_tflops         = tflops;
+                    best_ave_time       = ave_time;
+                    best_gb_per_sec     = gb_per_sec;
+                    best_kbatch         = kbatch_curr;
+                }
+            }
+            else
+            {
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
+            }
+        }
+    }
+
+    if constexpr(is_same<CDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<CDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    else if constexpr(is_same<CDataType, bhalf_t>::value)
+    {
+        std::cout << "Best Perf for datatype = bf16";
+    }
+    else if constexpr(is_same<CDataType, int8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = int8";
+    }
+
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+
+    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    if(best_op_object_name)
+        std::cout << best_op_object_name.value() << std::endl;
+
+    return pass;
+}
+
+template <typename Tuple>
+class TestGemmMX : public testing::Test
+{
+    using Row       = ck::tensor_layout::gemm::RowMajor;
+    using F32       = float;
+    using ScaleType = e8m0_bexp_t;
+
+    protected:
+    using ALayout     = std::tuple_element_t<0, Tuple>;
+    using BLayout     = std::tuple_element_t<1, Tuple>;
+    using CLayout     = Row;
+    using ADataType   = std::tuple_element_t<2, Tuple>;
+    using BDataType   = std::tuple_element_t<3, Tuple>;
+    using CDataType   = std::tuple_element_t<4, Tuple>;
+    using AccDataType = float;
+
+    public:
+    static constexpr index_t ScaleBlockSize = std::tuple_element_t<5, Tuple>{};
+    static constexpr bool verify_           = true;
+    static constexpr int init_method_       = 2; // decimal value initialization
+    static constexpr bool log_              = false;
+    static constexpr bool bench_            = false; // measure kernel performance
+    std::vector<int> k_batches_;
+
+    void SetUp() override { k_batches_ = {1}; }
+
+    void Run(const int M,
+             const int N,
+             const int K,
+             const int StrideA,
+             const int StrideB,
+             const int StrideC)
+    {
+        for(auto kb : k_batches_)
+        {
+            RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
+        }
+    }
+
+    void RunSingle(const int M,
+                   const int N,
+                   const int K,
+                   const int StrideA,
+                   const int StrideB,
+                   const int StrideC,
+                   int kbatch   = 1,
+                   int n_warmup = 1,
+                   int n_iter   = 10)
+    {
+        bool pass = ck::test::profile_gemm_mx_impl<ADataType,
+                                                   BDataType,
+                                                   CDataType,
+                                                   ALayout,
+                                                   BLayout,
+                                                   CLayout,
+                                                   ScaleBlockSize>(verify_,
+                                                                   init_method_,
+                                                                   log_,
+                                                                   bench_,
+                                                                   M,
+                                                                   N,
+                                                                   K,
+                                                                   StrideA,
+                                                                   StrideB,
+                                                                   StrideC,
+                                                                   kbatch,
+                                                                   n_warmup,
+                                                                   n_iter);
+        EXPECT_TRUE(pass);
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/gemm_universal/CMakeLists.txt b/test/gemm_universal/CMakeLists.txt
old mode 100644
new mode 100755
index 4aab6323cc..0a68622ebe
--- a/test/gemm_universal/CMakeLists.txt
+++ b/test/gemm_universal/CMakeLists.txt
@@ -1,4 +1,29 @@
-add_gtest_executable(test_gemm_universal test_gemm_universal_xdl.cpp)
+add_gtest_executable(test_gemm_universal_wmma_fp16 test_gemm_universal_wmma_fp16.cpp)
 if(result EQUAL 0)
-   target_link_libraries(test_gemm_universal PRIVATE utility device_gemm_universal_instance)
- endif()
+    target_link_libraries(test_gemm_universal_wmma_fp16 PRIVATE utility device_gemm_universal_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_wmma_bf16 test_gemm_universal_wmma_bf16.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_universal_wmma_bf16 PRIVATE utility device_gemm_universal_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_wmma_fp8 test_gemm_universal_wmma_fp8.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_universal_wmma_fp8 PRIVATE utility device_gemm_universal_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_xdl_fp16 test_gemm_universal_xdl_fp16.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_universal_xdl_fp16 PRIVATE utility device_gemm_universal_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_xdl_fp8 test_gemm_universal_xdl_fp8.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_universal_xdl_fp8 PRIVATE utility device_gemm_universal_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_xdl_bf16 test_gemm_universal_xdl_bf16.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_gemm_universal_xdl_bf16 PRIVATE utility device_gemm_universal_instance)
+endif()
diff --git a/test/gemm_universal/test_gemm_universal_ut_cases.inc b/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc
similarity index 75%
rename from test/gemm_universal/test_gemm_universal_ut_cases.inc
rename to test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc
index 9a21666856..8a6c672a9f 100644
--- a/test/gemm_universal/test_gemm_universal_ut_cases.inc
+++ b/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc
@@ -1,6 +1,6 @@
 #pragma once
 
-TYPED_TEST(TestGemmUniversal_MK_KN, SmallM)
+TYPED_TEST(TestGemmUniversal_BF16_MK_KN, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
@@ -14,7 +14,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, SmallM)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_MK_NK, SmallM)
+TYPED_TEST(TestGemmUniversal_BF16_MK_NK, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
@@ -28,7 +28,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, SmallM)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_KM_KN, SmallM)
+TYPED_TEST(TestGemmUniversal_BF16_KM_KN, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
@@ -44,7 +44,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, SmallM)
     }
 }
 
-TYPED_TEST(TestGemmUniversal_KM_NK, SmallM)
+TYPED_TEST(TestGemmUniversal_BF16_KM_NK, SmallM)
 {
     std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
@@ -60,7 +60,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, SmallM)
     }
 }
 
-TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM)
+TYPED_TEST(TestGemmUniversal_BF16_MK_KN, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 512;
@@ -74,7 +74,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM)
+TYPED_TEST(TestGemmUniversal_BF16_MK_NK, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 512;
@@ -88,7 +88,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM)
+TYPED_TEST(TestGemmUniversal_BF16_KM_KN, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 512;
@@ -104,7 +104,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM)
     }
 }
 
-TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM)
+TYPED_TEST(TestGemmUniversal_BF16_KM_NK, MidLargeM)
 {
     std::vector<int> Ms{127, 255, 312, 799, 1573};
     constexpr int N = 512;
@@ -120,7 +120,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM)
     }
 }
 
-TYPED_TEST(TestGemmUniversal_MK_KN, PaddK)
+TYPED_TEST(TestGemmUniversal_BF16_MK_KN, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 512;
@@ -134,7 +134,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, PaddK)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_MK_NK, PaddK)
+TYPED_TEST(TestGemmUniversal_BF16_MK_NK, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 512;
@@ -148,7 +148,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, PaddK)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_KM_KN, PaddK)
+TYPED_TEST(TestGemmUniversal_BF16_KM_KN, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 512;
@@ -164,7 +164,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, PaddK)
     }
 }
 
-TYPED_TEST(TestGemmUniversal_KM_NK, PaddK)
+TYPED_TEST(TestGemmUniversal_BF16_KM_NK, PaddK)
 {
     std::vector<int> Ms{127};
     constexpr int N = 512;
@@ -180,7 +180,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, PaddK)
     }
 }
 
-TYPED_TEST(TestGemmUniversal_MK_KN, Regular)
+TYPED_TEST(TestGemmUniversal_BF16_MK_KN, Regular)
 {
     std::vector<int> Ms{512};
     constexpr int N = 512;
@@ -194,7 +194,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, Regular)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
 
-TYPED_TEST(TestGemmUniversal_MK_NK, Regular)
+TYPED_TEST(TestGemmUniversal_BF16_MK_NK, Regular)
 {
     std::vector<int> Ms{512};
     constexpr int N = 512;
@@ -207,35 +207,3 @@ TYPED_TEST(TestGemmUniversal_MK_NK, Regular)
     for(int M : Ms)
         this->Run(M, N, K, StrideA, StrideB, StrideC);
 }
-
-TYPED_TEST(TestGemmUniversal_KM_KN, Regular)
-{
-    std::vector<int> Ms{512};
-    constexpr int N = 512;
-    constexpr int K = 512;
-
-    constexpr int StrideB = N;
-    constexpr int StrideC = N;
-
-    for(int M : Ms)
-    {
-        int StrideA = M;
-        this->Run(M, N, K, StrideA, StrideB, StrideC);
-    }
-}
-
-TYPED_TEST(TestGemmUniversal_KM_NK, Regular)
-{
-    std::vector<int> Ms{512};
-    constexpr int N = 512;
-    constexpr int K = 512;
-
-    constexpr int StrideB = N;
-    constexpr int StrideC = N;
-
-    for(int M : Ms)
-    {
-        int StrideA = M;
-        this->Run(M, N, K, StrideA, StrideB, StrideC);
-    }
-}
diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc
new file mode 100644
index 0000000000..6f6d550625
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc
@@ -0,0 +1,113 @@
+#pragma once
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_KN, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP16_MK_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc
new file mode 100644
index 0000000000..b831e15e9c
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc
@@ -0,0 +1,113 @@
+#pragma once
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_KN, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_FP8_MK_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
diff --git a/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp b/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
new file mode 100644
index 0000000000..22376a8599
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_util.hpp"
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_BF16_MK_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_BF16_MK_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_BF16_KM_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_BF16_KM_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,      BF16>
+    >;
+
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,      BF16>
+    >;
+
+using KernelTypes_KM_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,      BF16>
+    >;
+
+using KernelTypes_KM_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,      BF16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
+
+#include "test_gemm_universal_ut_cases_bf16.inc"
diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp
new file mode 100644
index 0000000000..1adee41ed2
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_util.hpp"
+
+using F16 = ck::half_t;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_FP16_MK_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_FP16_MK_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<      F16,       F16,             F16,       F16>
+    >;
+
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<      F16,       F16,             F16,       F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK);
+
+#include "test_gemm_universal_ut_cases_fp16.inc"
diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
new file mode 100644
index 0000000000..3579424496
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_util.hpp"
+
+#if CK_USE_WMMA_FP8
+
+using F8   = ck::f8_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_FP8_MK_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_FP8_MK_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<       F8,        F8,              F8,      BF16>
+    >;
+
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<       F8,        F8,              F8,      BF16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
+
+#include "test_gemm_universal_ut_cases_fp8.inc"
+
+#endif // CK_USE_WMMA_FP8
diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
similarity index 61%
rename from test/gemm_universal/test_gemm_universal_xdl.cpp
rename to test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
index b872d7089a..8fde65657a 100644
--- a/test/gemm_universal/test_gemm_universal_xdl.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp
@@ -7,8 +7,6 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "test_gemm_universal_util.hpp"
 
-using F8   = ck::f8_t;
-using F16  = ck::half_t;
 using BF16 = ck::bhalf_t;
 using F32  = float;
 
@@ -29,25 +27,25 @@ struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
 } // namespace
 
 template <typename Tuple>
-class TestGemmUniversal_MK_KN
+class TestGemmUniversal_BF16_MK_KN
     : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
 {
 };
 
 template <typename Tuple>
-class TestGemmUniversal_MK_NK
+class TestGemmUniversal_BF16_MK_NK
     : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
 {
 };
 
 template <typename Tuple>
-class TestGemmUniversal_KM_KN
+class TestGemmUniversal_BF16_KM_KN
     : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
 {
 };
 
 template <typename Tuple>
-class TestGemmUniversal_KM_NK
+class TestGemmUniversal_BF16_KM_NK
     : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
 {
 };
@@ -55,22 +53,12 @@ class TestGemmUniversal_KM_NK
 // clang-format off
 using KernelTypes_MK_KN = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<      F16,       F16,             F16,     F16>,
-#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
-    std::tuple<      F16,        F8,             F16,     F16>,
-    std::tuple<       F8,       F16,             F16,     F16>,
-    std::tuple<       F8,        F8,              F8,    BF16>,
-#endif
+
     std::tuple<     BF16,      BF16,            BF16,    BF16>
     >;
 using KernelTypes_MK_NK = ::testing::Types<
     //         ADataType, BDataType, ComputeDataType, CDataType
-    std::tuple<      F16,       F16,             F16,     F16>,
-#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
-    std::tuple<      F16,        F8,             F16,     F16>,
-    std::tuple<       F8,       F16,             F16,     F16>,
-    std::tuple<       F8,        F8,              F8,    BF16>,
-#endif
+
     std::tuple<     BF16,      BF16,            BF16,    BF16>
     >;
 
@@ -86,9 +74,9 @@ using KernelTypes_KM_KN = ::testing::Types<
 
 // clang-format on
 
-TYPED_TEST_SUITE(TestGemmUniversal_MK_KN, KernelTypes_MK_KN);
-TYPED_TEST_SUITE(TestGemmUniversal_MK_NK, KernelTypes_MK_NK);
-TYPED_TEST_SUITE(TestGemmUniversal_KM_KN, KernelTypes_KM_KN);
-TYPED_TEST_SUITE(TestGemmUniversal_KM_NK, KernelTypes_KM_NK);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
 
-#include "test_gemm_universal_ut_cases.inc"
+#include "test_gemm_universal_ut_cases_bf16.inc"
diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
new file mode 100644
index 0000000000..24f587daf6
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_util.hpp"
+
+using F8  = ck::f8_t;
+using F16 = ck::half_t;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_FP16_MK_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_FP16_MK_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_FP16_KM_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_FP16_KM_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+
+#endif
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+
+#endif
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK);
+
+#include "test_gemm_universal_ut_cases_fp16.inc"
diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
new file mode 100644
index 0000000000..e833ab7825
--- /dev/null
+++ b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_util.hpp"
+
+using F8   = ck::f8_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_FP8_MK_KN
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_FP8_MK_NK
+    : public ck::test::TestGemmUniversal<typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+    std::tuple<       F8,        F8,              F8,    BF16>,
+#endif
+    // Fallback test type when FP8 is not enabled
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+    std::tuple<       F8,        F8,              F8,    BF16>,
+#endif
+    // Fallback test type when FP8 is not enabled
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+
+
+TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
+
+
+#include "test_gemm_universal_ut_cases_fp8.inc"
diff --git a/test/gemm_universal_streamk/CMakeLists.txt b/test/gemm_universal_streamk/CMakeLists.txt
new file mode 100755
index 0000000000..6e42bfe396
--- /dev/null
+++ b/test/gemm_universal_streamk/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_gtest_executable(test_gemm_universal_streamk_fp16 test_gemm_universal_streamk_xdl_fp16.cpp)
+if(result EQUAL 0)
+   target_link_libraries(test_gemm_universal_streamk_fp16 PRIVATE utility device_gemm_universal_streamk_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_streamk_fp8 test_gemm_universal_streamk_xdl_fp8.cpp)
+if(result EQUAL 0)
+   target_link_libraries(test_gemm_universal_streamk_fp8 PRIVATE utility device_gemm_universal_streamk_instance)
+endif()
+
+add_gtest_executable(test_gemm_universal_streamk_bf16 test_gemm_universal_streamk_xdl_bf16.cpp)
+if(result EQUAL 0)
+   target_link_libraries(test_gemm_universal_streamk_bf16 PRIVATE utility device_gemm_universal_streamk_instance)
+endif()
+
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc
new file mode 100644
index 0000000000..22977866b5
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc
@@ -0,0 +1,149 @@
+#pragma once
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+    {
+        int StrideA = M;
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+    }
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+    {
+        int StrideA = M;
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+    }
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+    {
+        int StrideA = M;
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+    }
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+    {
+        int StrideA = M;
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+    }
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc
new file mode 100644
index 0000000000..99c8e6d163
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc
@@ -0,0 +1,85 @@
+#pragma once
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc
new file mode 100755
index 0000000000..b98ee92800
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc
@@ -0,0 +1,85 @@
+#pragma once
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 512;
+    constexpr int K = 320;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, PaddK)
+{
+    std::vector<int> Ms{127};
+    constexpr int N = 512;
+    constexpr int K = 437;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = N;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
+
+TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, Regular)
+{
+    std::vector<int> Ms{512};
+    constexpr int N = 512;
+    constexpr int K = 512;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC);
+}
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp
new file mode 100644
index 0000000000..805587a274
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/profile_gemm_universal_streamk_impl.hpp"
+
+namespace ck {
+namespace test {
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk : public testing::Test
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using F32 = float;
+
+    protected:
+    using ALayout         = std::tuple_element_t<0, Tuple>;
+    using BLayout         = std::tuple_element_t<1, Tuple>;
+    using CLayout         = Row;
+    using ADataType       = std::tuple_element_t<2, Tuple>;
+    using BDataType       = std::tuple_element_t<3, Tuple>;
+    using ComputeDataType = std::tuple_element_t<4, Tuple>;
+    using CDataType       = std::tuple_element_t<5, Tuple>;
+
+    public:
+    static constexpr bool verify_     = true;
+    static constexpr int init_method_ = 1; // decimal value initialization
+    static constexpr bool log_        = false;
+    static constexpr bool bench_      = false; // measure kernel performance
+
+    std::vector<int> grid_size_list;
+    std::vector<int> streamk_sel_list;
+
+    void SetUp() override
+    {
+        streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile
+                                      // Stream-K+ DP, // {0, 1, 2, 3, 4}
+        // 2:2-tile Stream-K + DP
+    }
+
+    void Run(const int M,
+             const int N,
+             const int K,
+             const int StrideA,
+             const int StrideB,
+             const int StrideC)
+    {
+        for(auto streamk_sel : streamk_sel_list)
+        {
+            RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, -1);
+        }
+    }
+
+    void RunSingle(const int M,
+                   const int N,
+                   const int K,
+                   const int StrideA,
+                   const int StrideB,
+                   const int StrideC,
+                   int streamk_sel,
+                   int Grid_size,
+                   int n_warmup = 1,
+                   int n_iter   = 10)
+    {
+        bool pass = ck::profiler::profile_gemm_universal_streamk_impl<ADataType,
+                                                                      BDataType,
+                                                                      ComputeDataType,
+                                                                      F32,
+                                                                      CDataType,
+                                                                      ALayout,
+                                                                      BLayout,
+                                                                      CLayout>(verify_,
+                                                                               init_method_,
+                                                                               log_,
+                                                                               bench_,
+                                                                               M,
+                                                                               N,
+                                                                               K,
+                                                                               StrideA,
+                                                                               StrideB,
+                                                                               StrideC,
+                                                                               streamk_sel,
+                                                                               Grid_size,
+                                                                               n_warmup,
+                                                                               n_iter);
+        EXPECT_TRUE(pass);
+    }
+};
+
+} // namespace test
+} // namespace ck
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp
new file mode 100755
index 0000000000..1aef74cf18
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_streamk_util.hpp"
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_BF16_MK_KN
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_BF16_MK_NK
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_BF16_KM_KN
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_BF16_KM_NK
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
+    >;
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
+    >;
+
+using KernelTypes_KM_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
+    >;
+
+using KernelTypes_KM_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
+    >;
+
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_MK_NK, KernelTypes_MK_NK);
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_KN, KernelTypes_KM_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_NK, KernelTypes_KM_NK);
+
+#include "test_gemm_universal_streamk_ut_cases_bf16.inc"
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp
new file mode 100644
index 0000000000..43b122ff0d
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_streamk_util.hpp"
+
+using F8  = ck::f8_t;
+using F16 = ck::half_t;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_FP16_MK_KN
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_FP16_MK_NK
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_FP16_KM_KN
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Col, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_FP16_KM_NK
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Col, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+#endif
+
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+#endif
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_NK, KernelTypes_MK_NK);
+
+#include "test_gemm_universal_streamk_ut_cases_fp16.inc"
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp
new file mode 100755
index 0000000000..3836de056c
--- /dev/null
+++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_gemm_universal_streamk_util.hpp"
+
+using F8   = ck::f8_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_FP8_MK_KN
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Row, Row>, Tuple>::type>
+{
+};
+
+template <typename Tuple>
+class TestGemmUniversal_Streamk_FP8_MK_NK
+    : public ck::test::TestGemmUniversal_Streamk<
+          typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+    std::tuple<       F8,        F8,              F8,    BF16>,
+#endif
+    // Fallback test type when FP8 is not enabled
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+
+#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94))
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+    std::tuple<       F8,        F8,              F8,    BF16>,
+#endif
+    // Fallback test type when FP8 is not enabled
+    std::tuple<      F16,       F16,             F16,     F16>
+    >;
+
+// clang-format on
+
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_NK, KernelTypes_MK_NK);
+
+#include "test_gemm_universal_streamk_ut_cases_fp8.inc"
diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt
index 6d78da8db7..5c816da416 100644
--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -2,6 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_da
 if(result EQUAL 0)
     target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
 endif()
+if(GPU_TARGETS MATCHES "gfx9")
+    add_executable(test_grouped_convnd_bwd_data_xdl_large_cases test_grouped_convnd_bwd_data_xdl_large_cases.cpp)
+    target_compile_options(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE -Wno-global-constructors -Wno-undef)
+    target_link_libraries(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE gtest_main getopt::getopt utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
+endif()
 add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp)
 if(result EQUAL 0)
     target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
index fdc8fb64e5..c4404b95ba 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -21,26 +21,31 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
     using InLayout  = std::tuple_element_t<3, Tuple>;
 
     std::vector<ck::utils::conv::ConvParam> conv_params;
+    std::vector<ck::index_t> split_ks{1, 2};
 
     template <ck::index_t NDimSpatial>
     void Run()
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(auto split_k : split_ks)
         {
-            pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
-                                                                            OutLayout,
-                                                                            WeiLayout,
-                                                                            InLayout,
-                                                                            DataType,
-                                                                            DataType,
-                                                                            DataType>(
-                               true,  // do_verification
-                               1,     // init_method: integer value
-                               false, // do_log
-                               false, // time_kernel
-                               param);
+            for(auto& param : conv_params)
+            {
+                pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                                OutLayout,
+                                                                                WeiLayout,
+                                                                                InLayout,
+                                                                                DataType,
+                                                                                DataType,
+                                                                                DataType>(
+                                   true,  // do_verification
+                                   1,     // init_method: integer value
+                                   false, // do_log
+                                   false, // time_kernel
+                                   param,
+                                   split_k);
+            }
         }
         EXPECT_TRUE(pass);
     }
@@ -51,6 +56,12 @@ using namespace ck::tensor_layout::convolution;
 using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
+                                       std::tuple<float, NGKHW, GKYXC, NGCHW>,
+                                       std::tuple<ck::half_t, NGKHW, GKYXC, NGCHW>,
+                                       std::tuple<ck::bhalf_t, NGKHW, GKYXC, NGCHW>,
+                                       std::tuple<float, NGKHW, GKCYX, NGCHW>,
+                                       std::tuple<ck::half_t, NGKHW, GKCYX, NGCHW>,
+                                       std::tuple<ck::bhalf_t, NGKHW, GKCYX, NGCHW>,
                                        std::tuple<float, NHWGK, GKYXC, NHWGC>,
                                        std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
                                        std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;
@@ -58,6 +69,12 @@ using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
+                                       std::tuple<float, NGKDHW, GKZYXC, NGCDHW>,
+                                       std::tuple<ck::half_t, NGKDHW, GKZYXC, NGCDHW>,
+                                       std::tuple<ck::bhalf_t, NGKDHW, GKZYXC, NGCDHW>,
+                                       std::tuple<float, NGKDHW, GKCZYX, NGCDHW>,
+                                       std::tuple<ck::half_t, NGKDHW, GKCZYX, NGCDHW>,
+                                       std::tuple<ck::bhalf_t, NGKDHW, GKCZYX, NGCDHW>,
                                        std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
                                        std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
                                        std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>;
@@ -80,19 +97,16 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D)
     this->conv_params.clear();
 
     this->conv_params.push_back(
-        {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+        {2, 2, 2, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back(
-        {2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+        {2, 2, 2, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back(
-        {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 2, 2, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
     this->conv_params.push_back(
-        {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
-    this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
-    this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
-    this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
-    // SplitN case
-    this->conv_params.push_back(
-        {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 2, 2, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {16, 16}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {16, 16}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {16, 16}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->template Run<2>();
 }
 
@@ -100,28 +114,16 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
 {
     this->conv_params.clear();
     this->conv_params.push_back(
-        {3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 2, 2, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
     this->conv_params.push_back(
         {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->conv_params.push_back(
-        {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 2, 2, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
     this->conv_params.push_back(
-        {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+        {3, 1, 1, 1, 32, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->conv_params.push_back(
-        {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+        {3, 1, 1, 64, 3, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->conv_params.push_back(
-        {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
-    // SplitN case
-    this->conv_params.push_back({3,
-                                 1,
-                                 128,
-                                 4,
-                                 192,
-                                 {2, 2, 2},
-                                 {2, 224, 224},
-                                 {1, 224, 224},
-                                 {1, 1, 1},
-                                 {0, 0, 0},
-                                 {0, 0, 0}});
+        {3, 1, 1, 1, 1, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp
new file mode 100644
index 0000000000..73d793cc5f
--- /dev/null
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
+
+template <typename Tuple>
+class TestGroupedConvndBwdDataXdl : public ::testing::Test
+{
+    protected:
+    using DataType  = std::tuple_element_t<0, Tuple>;
+    using OutLayout = std::tuple_element_t<1, Tuple>;
+    using WeiLayout = std::tuple_element_t<2, Tuple>;
+    using InLayout  = std::tuple_element_t<3, Tuple>;
+
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+    std::vector<ck::index_t> split_ks{1, 2};
+
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto split_k : split_ks)
+        {
+            for(auto& param : conv_params)
+            {
+                pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
+                                                                                OutLayout,
+                                                                                WeiLayout,
+                                                                                InLayout,
+                                                                                DataType,
+                                                                                DataType,
+                                                                                DataType>(
+                                   true,  // do_verification
+                                   1,     // init_method: integer value
+                                   false, // do_log
+                                   false, // time_kernel
+                                   param,
+                                   split_k);
+            }
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using namespace ck::tensor_layout::convolution;
+
+using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
+                                       std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
+                                       std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
+                                       std::tuple<float, NGKHW, GKYXC, NGCHW>,
+                                       std::tuple<ck::half_t, NGKHW, GKYXC, NGCHW>,
+                                       std::tuple<ck::bhalf_t, NGKHW, GKYXC, NGCHW>,
+                                       std::tuple<float, NGKHW, GKCYX, NGCHW>,
+                                       std::tuple<ck::half_t, NGKHW, GKCYX, NGCHW>,
+                                       std::tuple<ck::bhalf_t, NGKHW, GKCYX, NGCHW>,
+                                       std::tuple<float, NHWGK, GKYXC, NHWGC>,
+                                       std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
+                                       std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;
+
+using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
+                                       std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
+                                       std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
+                                       std::tuple<float, NGKDHW, GKZYXC, NGCDHW>,
+                                       std::tuple<ck::half_t, NGKDHW, GKZYXC, NGCDHW>,
+                                       std::tuple<ck::bhalf_t, NGKDHW, GKZYXC, NGCDHW>,
+                                       std::tuple<float, NGKDHW, GKCZYX, NGCDHW>,
+                                       std::tuple<ck::half_t, NGKDHW, GKCZYX, NGCDHW>,
+                                       std::tuple<ck::bhalf_t, NGKDHW, GKCZYX, NGCDHW>,
+                                       std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
+                                       std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
+                                       std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>;
+
+template <typename Tuple>
+class TestGroupedConvndBwdDataXdl2d : public TestGroupedConvndBwdDataXdl<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestGroupedConvndBwdDataXdl3d : public TestGroupedConvndBwdDataXdl<Tuple>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl3d, KernelTypes3d);
+
+TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D)
+{
+    this->conv_params.clear();
+    // SplitN case
+    this->conv_params.push_back(
+        {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
+    this->template Run<2>();
+}
+
+TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
+{
+    this->conv_params.clear();
+    // SplitN case
+    this->conv_params.push_back({3,
+                                 1,
+                                 128,
+                                 4,
+                                 192,
+                                 {2, 2, 2},
+                                 {2, 224, 224},
+                                 {1, 224, 224},
+                                 {1, 1, 1},
+                                 {0, 0, 0},
+                                 {0, 0, 0}});
+    this->template Run<3>();
+}
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
index 54b96d775c..21f2cb5ce6 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -146,8 +146,12 @@ using KernelTypes2d = ::testing::Types<
     std::tuple<float, float, float, NHWGC, GKYXC, NHWGK, ck::Number<2>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, NHWGC, GKYXC, NHWGK, ck::Number<2>>,
     std::tuple<ck::bhalf_t, float, ck::bhalf_t, NHWGC, GKYXC, NHWGK, ck::Number<2>>,
+    std::tuple<float, float, float, NGCHW, GKYXC, NGKHW, ck::Number<2>>,
     std::tuple<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, NGCHW, GKYXC, NGKHW, ck::Number<2>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCHW, GKYXC, NGKHW, ck::Number<2>>>;
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCHW, GKYXC, NGKHW, ck::Number<2>>,
+    std::tuple<float, float, float, NGCHW, GKCYX, NGKHW, ck::Number<2>>,
+    std::tuple<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, NGCHW, GKCYX, NGKHW, ck::Number<2>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCHW, GKCYX, NGKHW, ck::Number<2>>>;
 using KernelTypes3d = ::testing::Types<
     std::tuple<float, float, float, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
@@ -158,8 +162,12 @@ using KernelTypes3d = ::testing::Types<
     std::tuple<ck::half_t, ck::half_t, ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
     std::tuple<ck::bhalf_t, float, ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
     std::tuple<int8_t, int8_t, int8_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
+    std::tuple<float, float, float, NGCDHW, GKZYXC, NGKDHW, ck::Number<3>>,
     std::tuple<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, NGCDHW, GKZYXC, NGKDHW, ck::Number<3>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCDHW, GKZYXC, NGKDHW, ck::Number<3>>>;
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCDHW, GKZYXC, NGKDHW, ck::Number<3>>,
+    std::tuple<float, float, float, NGCDHW, GKCZYX, NGKDHW, ck::Number<3>>,
+    std::tuple<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, NGCDHW, GKCZYX, NGKDHW, ck::Number<3>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCDHW, GKCZYX, NGKDHW, ck::Number<3>>>;
 
 TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d);
 TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d);
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
index 25481e0d7f..1cf91df52c 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -65,7 +65,10 @@ using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
                                        std::tuple<float, NGCHW, GKYXC, NGKHW>,
                                        std::tuple<ck::half_t, NGCHW, GKYXC, NGKHW>,
                                        std::tuple<ck::bhalf_t, NGCHW, GKYXC, NGKHW>,
-                                       std::tuple<int8_t, NGCHW, GKYXC, NGKHW>>;
+                                       std::tuple<int8_t, NGCHW, GKYXC, NGKHW>,
+                                       std::tuple<float, NGCHW, GKCYX, NGKHW>,
+                                       std::tuple<ck::half_t, NGCHW, GKCYX, NGKHW>,
+                                       std::tuple<ck::bhalf_t, NGCHW, GKCYX, NGKHW>>;
 
 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
                                        std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>,
@@ -74,7 +77,9 @@ using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>
                                        std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
                                        std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
                                        std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
-                                       std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK>>;
+                                       std::tuple<float, NGCDHW, GKCZYX, NGKDHW>,
+                                       std::tuple<ck::half_t, NGCDHW, GKCZYX, NGKDHW>,
+                                       std::tuple<ck::bhalf_t, NGCDHW, GKCZYX, NGKDHW>>;
 
 template <typename Tuple>
 class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple>
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp
index 088fed89ff..d017a40bce 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 #include <iostream>
@@ -83,6 +83,9 @@ TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
     // When image is larger than 2GB
     this->conv_params.push_back(
         {2, 2, 2, 128, 128, {3, 3}, {4096, 2048}, {300, 300}, {3, 3}, {1, 1}, {1, 1}});
+    // Split N and G > 1
+    this->conv_params.push_back(
+        {2, 4, 112, 8, 8, {3, 3}, {469, 724}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
     this->template Run<2>();
 }
 
diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp
index 78b43402de..45a3deabe2 100644
--- a/test/mx_mfma_op/mx_mfma_op.hpp
+++ b/test/mx_mfma_op/mx_mfma_op.hpp
@@ -31,48 +31,69 @@ enum class MFMA_F8F6F4
 
 };
 
-template <typename AFragT, typename BFragT, typename AccumFragT, int32_t BLOCK_M, int32_t BLOCK_N>
+template <int32_t BLOCK_M, int32_t BLOCK_N>
 struct mfma_type_selector;
 
-template <typename AFragT, typename BFragT, typename AccumFragT>
-struct mfma_type_selector<AFragT, BFragT, AccumFragT, 16, 16>
+template <>
+struct mfma_type_selector<16, 16>
 {
-    __device__ void operator()(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc)
+    template <typename AFragT, typename BFragT, typename AccumFragT>
+    __device__ static void run(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc)
     {
         auto op = mfma_type<MfmaInstr::mfma_f32_16x16x128f8f6f4>{};
-        op.template run<16, 16, AFragT, BFragT, AccumFragT>(fragA, fragB, fragAcc);
-    }
-
-    __device__ void operator()(AFragT const& fragA,
-                               const int32_t scale_a,
-                               BFragT const& fragB,
-                               const int32_t scale_b,
-                               AccumFragT& fragAcc)
-    {
-        auto op = mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>{};
-        op.template run<16, 16, AFragT, BFragT, AccumFragT>(
-            fragA, scale_a, fragB, scale_b, fragAcc);
+        op.template run<16, 16>(fragA, fragB, fragAcc);
     }
 };
 
-template <typename AFragT, typename BFragT, typename AccumFragT>
-struct mfma_type_selector<AFragT, BFragT, AccumFragT, 32, 32>
+template <>
+struct mfma_type_selector<32, 32>
 {
-    __device__ void operator()(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc)
+    template <typename AFragT, typename BFragT, typename AccumFragT>
+    __device__ static void run(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc)
     {
         auto op = mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>{};
-        op.template run<32, 32, AFragT, BFragT, AccumFragT>(fragA, fragB, fragAcc);
+        op.template run<32, 32>(fragA, fragB, fragAcc);
     }
+};
 
-    __device__ void operator()(AFragT const& fragA,
-                               const int32_t scale_a,
+template <int32_t BLOCK_M, int32_t BLOCK_N>
+struct mfma_scale_type_selector;
+
+template <>
+struct mfma_scale_type_selector<16, 16>
+{
+    template <typename AFragT,
+              typename AScaleFragT,
+              typename BFragT,
+              typename BScaleFragT,
+              typename AccumFragT>
+    __device__ static void run(AFragT const& fragA,
+                               AScaleFragT const& scale_a,
                                BFragT const& fragB,
-                               const int32_t scale_b,
+                               BScaleFragT const& scale_b,
+                               AccumFragT& fragAcc)
+    {
+        auto op = mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>{};
+        op.template run<16, 16>(fragA, scale_a[Number<0>{}], fragB, scale_b[Number<0>{}], fragAcc);
+    }
+};
+
+template <>
+struct mfma_scale_type_selector<32, 32>
+{
+    template <typename AFragT,
+              typename AScaleFragT,
+              typename BFragT,
+              typename BScaleFragT,
+              typename AccumFragT>
+    __device__ static void run(AFragT const& fragA,
+                               AScaleFragT const& scale_a,
+                               BFragT const& fragB,
+                               BScaleFragT const& scale_b,
                                AccumFragT& fragAcc)
     {
         auto op = mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>{};
-        op.template run<32, 32, AFragT, BFragT, AccumFragT>(
-            fragA, scale_a, fragB, scale_b, fragAcc);
+        op.template run<32, 32>(fragA, scale_a[Number<0>{}], fragB, scale_b[Number<0>{}], fragAcc);
     }
 };
 
@@ -393,8 +414,7 @@ __device__ AFragT load_mx_A_row_major(AType const* input_ptr,
     // BLOCK_K / BLOCK_X is a stride in xA matrix
     auto startOffset = row_major(startCoord2D, BLOCK_K / BLOCK_X);
 
-    // obtain 8-bit exponent
-    fragX = utils::get_exponent_value(scale_ptr[startOffset]) & 0xFF;
+    fragX = scale_ptr[startOffset];
 
     return load_A_row_major<AType, AFragT, BLOCK_M, BLOCK_K>(input_ptr);
 }
@@ -634,7 +654,7 @@ __device__ BFragT load_mx_B_col_major(BType const* input_ptr,
     auto startOffset = col_major(startCoord2D, BLOCK_K / BLOCK_X);
 
     // obtain 8-bit exponent
-    fragX = utils::get_exponent_value(scale_ptr[startOffset]) & 0xFF;
+    fragX = scale_ptr[startOffset];
 
     return load_B_col_major<BType, BFragT, BLOCK_K, BLOCK_N>(input_ptr);
 }
@@ -909,7 +929,8 @@ __global__ void matmul(const AType* a, const BType* b, CType* c)
 
     // Matrix multiply-accumulate using MFMA units
     // Accumulation intermediate = BLOCK_M x BLOCK_N
-    mfma_type_selector<AFragT, BFragT, AccumFragT, BLOCK_M, BLOCK_N>{}(fragA, fragB, fragAcc);
+    using mfma = mfma_type_selector<BLOCK_M, BLOCK_N>;
+    mfma::template run<>(fragA, fragB, fragAcc);
 
     for(int i = 0; i < vectorSize(fragC); ++i)
     {
@@ -948,15 +969,16 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb,
     using CFragT        = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
     using AccumFragT    = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
     using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
-    using ScaleFragT    = int32_t;
+    using AScaleFragT   = vector_type<ScaleType, 1>::type;
+    using BScaleFragT   = vector_type<ScaleType, 1>::type;
 
     // Create frags
     auto fragA   = AFragT{};
     auto fragB   = BFragT{};
     auto fragC   = CFragT{};
     auto fragAcc = AccumFragT{0};
-    auto fragXa  = ScaleFragT{0};
-    auto fragXb  = ScaleFragT{0};
+    auto fragXa  = AScaleFragT{};
+    auto fragXb  = BScaleFragT{};
 
     // Load the inputs.
     fragA = load_mx_A_row_major<AType, AFragT, ScaleType, ScaleFragT, BLOCK_M, BLOCK_K, BLOCK_X>(
@@ -967,8 +989,12 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb,
 
     // Scaled Matrix multiply-accumulate using MFMA units
     // Accumulation intermediate = BLOCK_M x BLOCK_N
-    mfma_type_selector<AFragT, BFragT, AccumFragT, BLOCK_M, BLOCK_N>{}(
-        fragA, fragXa, fragB, fragXb, fragAcc);
+    using mfma = mfma_scale_type_selector<BLOCK_M, BLOCK_N>;
+    mfma::template run<>(fragA,
+                         fragXa.template AsType<ScaleType>(),
+                         fragB,
+                         fragXb.template AsType<ScaleType>(),
+                         fragAcc);
 
     for(int i = 0; i < vectorSize(fragC); ++i)
     {
diff --git a/tile_engine/CMakeLists.txt b/tile_engine/CMakeLists.txt
new file mode 100755
index 0000000000..cd1a192a74
--- /dev/null
+++ b/tile_engine/CMakeLists.txt
@@ -0,0 +1,5 @@
+include_directories(BEFORE
+        ${CMAKE_CURRENT_LIST_DIR}/include
+    )
+
+add_subdirectory(ops)
diff --git a/tile_engine/include/CMakeLists.txt b/tile_engine/include/CMakeLists.txt
new file mode 100755
index 0000000000..d11a4b3bee
--- /dev/null
+++ b/tile_engine/include/CMakeLists.txt
@@ -0,0 +1 @@
+message("Add include directory")
diff --git a/tile_engine/ops/CMakeLists.txt b/tile_engine/ops/CMakeLists.txt
new file mode 100755
index 0000000000..0cf2c16da2
--- /dev/null
+++ b/tile_engine/ops/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(gemm)
diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt
new file mode 100644
index 0000000000..bc613a931e
--- /dev/null
+++ b/tile_engine/ops/gemm/CMakeLists.txt
@@ -0,0 +1,51 @@
+
+
+# generate a list of kernels, but not actually emit files at config stage
+execute_process(
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+            --working_path ${CMAKE_CURRENT_BINARY_DIR}
+            --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json
+            --list_blobs
+            RESULT_VARIABLE ret
+)
+set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
+  ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+  ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json
+)
+
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
+endif()
+
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt GEMM_CODEGEN_BLOBS)
+
+add_custom_command(
+    OUTPUT  ${GEMM_CODEGEN_BLOBS}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+            --working_path ${CMAKE_CURRENT_BINARY_DIR}
+            --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json
+            --gen_blobs
+    DEPENDS ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+            ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt
+            ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json
+)
+
+set(EXECUTABLE_GEMM_INSTANCE "tile_engine_gemm")
+message("adding example ${EXECUTABLE_GEMM_INSTANCE}")
+
+# use build as include directory
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+add_executable(${EXECUTABLE_GEMM_INSTANCE} EXCLUDE_FROM_ALL gemm_host_api.cpp)
+target_include_directories(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${GEMM_CODEGEN_BLOBS})
+
+set(EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS)
+
+list(APPEND EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS
+     -Wno-undefined-func-template
+     -Wno-float-equal
+     --offload-compress)
+
+target_compile_options(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS})
+
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md
new file mode 100644
index 0000000000..f7d86e90fe
--- /dev/null
+++ b/tile_engine/ops/gemm/README.md
@@ -0,0 +1,92 @@
+# GEMM Matrix Multiplication
+
+CK Tile Engine GEMM is used to generate and run GEMM kernels with different combinations of BlockTile sizes, WarpTile sizes, WarpTile mapping for all valid pipelines, schedulers and epilogues. 
+
+# Kernel Configurations
+
+Kernel parameters are specified in the `instance_combination.json` file, including matrix layouts, data types, padding settings, pipelines, schedulers, epilogues, and numerical values for tile and warp sizes.
+
+Given a valid set of values, tile_engine_gemm will automatically iterate over all possible combinations of BlockTile and WarpTile sizes, as well as the specified pipelines, schedulers, and epilogues from `./configs/instance_combination.json`, and build the corresponding kernels.
+
+
+## Build Instructions
+``` bash
+# in the root of composable kernel create build directory
+mkdir build && cd build
+# build composable kernel
+sh ../script/cmake-ck-dev.sh  ../ <arch> # replace <arch> with the appropriate architecture (example gfx942) or leave blank
+# generate the executable
+make tile_engine_gemm -j
+```
+`tile_engine_gemm` will be located in the `./bin/` directory.
+
+_`tile_engine_gemm` must be rebuilt everytime `instance_combination.json` is modified._
+``` bash
+rm -rf tile_engine/ && make tile_engine_gemm -j  # rebuild
+```
+
+## tile_engine_gemm inputs
+```
+
+                  -m    m dimension (default:3840)
+                  -n    n dimension (default:4096)
+                  -k    k dimension (default:2048)
+           -stride_a    Tensor A stride (default:0)
+           -stride_b    Tensor B stride (default:0)
+           -stride_c    Tensor C stride (default:0)
+            -split_k    SplitK value (default:1)
+                  -v    No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2)
+             -warmup    Number of iterations before benchmark the kernel (default:50)
+             -repeat    Number of iterations to benchmark the kernel (default:100)
+              -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+               -init    Value for initializing tensor- random: 0, linear: 1, constant(1): 2 (default:0)
+-structured_sparsity    Sparsity for tensor - 0:false, 1:true (default: 0)
+           -pipeline    possible values are: compv3, compv4, mem (default:compv3)
+          -scheduler    possible values are: intrawave, interwave (default:intrawave)
+           -epilogue    possible values are: cshuffle, default (default:cshuffle)
+              -pad_m    Pad in m direction - true/false (default:false)
+              -pad_n    Pad in n direction - true/false (default:false)
+              -pad_k    Pad in k direction - true/false (default:false)
+
+Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json 
+```
+Note: In `./configs/instance_combination.json` pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be from one of the values specified above. 
+
+## Example
+
+The following JSON file specifies parameters used to generate and build GEMM kernels across all possible combinations of pipelines, schedulers, epilogues with different tile and warp sizes.
+
+```json
+{     
+    /// other parameters ///
+    
+    "tile_m": {
+      "values": [256]
+    },
+    "tile_n": {
+      "values": [256]
+    },
+    "tile_k": {
+      "values": [64, 32]
+    },
+
+    /// other parameters ///
+
+    "pipeline": {
+      "values": ["compv3", "compv4", "mem"]
+    },
+    "scheduler": {
+      "values": ["intrawave", "interwave"]
+    },
+    "epilogue": {
+      "values": ["default", "cshuffle"]
+    }
+}
+```
+
+At runtime, a specific subset of the generated kernels can be selected using command-line arguments.
+``` bash
+./bin/tile_engine_gemm -pipeline=compv3 -scheduler=intrawave -epilogue=default 
+```
+The above command runs kernels configured with the compv3 pipeline, intrawave scheduler, and default epilogue, while sweeping over different BlockTile sizes, WarpTile sizes, and WarpTile mappings.
+
diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json
new file mode 100644
index 0000000000..66dbdafa11
--- /dev/null
+++ b/tile_engine/ops/gemm/configs/instance_combination.json
@@ -0,0 +1,60 @@
+{
+   
+    "layout_a": {
+      "values": ["r"]
+    },
+    "layout_b": {
+      "values": ["c"]
+    },
+    "layout_c": {
+      "values": ["r"]
+    },
+    "datatype": {
+      "values": ["fp16"]
+    },
+    "tile_m": {
+      "values": [256]
+    },
+    "tile_n": {
+      "values": [256]
+    },
+    "tile_k": {
+      "values": [64, 32]
+    },
+    "warp_m": {
+      "values": [2]
+    },
+    "warp_n": {
+      "values": [2]
+    },
+    "warp_k": {
+      "values": [1]
+    },
+    "warp_tile_m": {
+      "values": [32]
+    },
+    "warp_tile_n": {
+      "values": [32]
+    },
+    "warp_tile_k": {
+      "values": [16]
+    },
+    "kPadM": {
+      "values": [false]
+    },
+    "kPadN": {
+      "values": [false]
+    },
+    "kPadK": {
+      "values": [false]
+    },
+    "pipeline": {
+      "values": ["compv3", "compv4", "mem"]
+    },
+    "scheduler": {
+      "values": ["intrawave", "interwave"]
+    },
+    "epilogue": {
+      "values": ["default", "cshuffle"]
+    }
+}
diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp
new file mode 100755
index 0000000000..a5447cd658
--- /dev/null
+++ b/tile_engine/ops/gemm/gemm_host_api.cpp
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/host.hpp"
+#include "gemm_common.hpp"
+#include "gemm_dispatcher.hpp"
+#include "gemm_host_api.hpp"
+
+void gemm_kernel_launch(ck_tile::DeviceMem& c_m_n_dev_buf,
+                        ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                        ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+                        int verify,
+                        bool structured_sparsity,
+                        KernelTraits& trait,
+                        ck_tile::GemmHostArgs& args,
+                        const ck_tile::stream_config& stream)
+{
+    return GemmDispatcher::dispatch(c_m_n_dev_buf,
+                                    c_m_n_host_result,
+                                    c_m_n_dev_result,
+                                    verify,
+                                    structured_sparsity,
+                                    trait,
+                                    args,
+                                    stream);
+}
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+void run(const ck_tile::ArgParser& arg_parser)
+{
+    const ALayout a_layout = ALayout{};
+    const BLayout b_layout = BLayout{};
+
+    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
+    ck_tile::index_t M      = arg_parser.get_int("m");
+    ck_tile::index_t N      = arg_parser.get_int("n");
+    ck_tile::index_t K      = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    int verify                   = arg_parser.get_int("v");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool structured_sparsity     = arg_parser.get_bool("structured_sparsity");
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(1)}(a_m_k);
+        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(1)}(b_k_n);
+    }
+    else
+    {
+        a_m_k.SetZero();
+        b_k_n.SetZero();
+    }
+
+    if(structured_sparsity)
+    {
+        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
+    }
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+    {
+        // Permute vector pk_i4x4 data for device implementation
+        ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+        // permute_tensor_b<decltype(b_k_n_dev)>(b_k_n_dev);
+        permute_vectors_i4x4_b(b_k_n_dev);
+        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+    }
+    else
+    {
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+    }
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    ck_tile::GemmHostArgs gemm_args;
+    gemm_args.a_ptr    = a_m_k_dev_buf.GetDeviceBuffer();
+    gemm_args.b_ptr    = b_k_n_dev_buf.GetDeviceBuffer();
+    gemm_args.c_ptr    = c_m_n_dev_buf.GetDeviceBuffer();
+    gemm_args.k_batch  = kbatch;
+    gemm_args.M        = M;
+    gemm_args.N        = N;
+    gemm_args.K        = K;
+    gemm_args.stride_A = stride_A;
+    gemm_args.stride_B = stride_B;
+    gemm_args.stride_C = stride_C;
+
+    KernelTraits trait;
+    trait.pipeline  = arg_parser.get_str("pipeline");
+    trait.scheduler = arg_parser.get_str("scheduler");
+    trait.epilogue  = arg_parser.get_str("epilogue");
+    trait.kPadM     = arg_parser.get_bool("pad_m");
+    trait.kPadN     = arg_parser.get_bool("pad_n");
+    trait.kPadK     = arg_parser.get_bool("pad_k");
+
+    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits<ADataType>::name
+              << " B Type = " << DataTypeTraits<BDataType>::name
+              << " C Type = " << DataTypeTraits<CDataType>::name << std::endl;
+
+    ck_tile::HostTensor<CDataType> c_m_n_host_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    if(verify)
+    {
+        gemm_host_reference<ADataType,
+                            BDataType,
+                            AccDataType,
+                            CDataType,
+                            ALayout,
+                            BLayout,
+                            CLayout>(verify,
+                                     a_m_k,
+                                     b_k_n,
+                                     c_m_n_host_result,
+                                     a_m_k_dev_buf,
+                                     b_k_n_dev_buf,
+                                     M,
+                                     N,
+                                     K,
+                                     stride_A,
+                                     stride_B,
+                                     stride_C);
+    }
+
+    gemm_kernel_launch(c_m_n_dev_buf,
+                       c_m_n_host_result,
+                       c_m_n_dev_result,
+                       verify,
+                       structured_sparsity,
+                       trait,
+                       gemm_args,
+                       ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    return;
+}
+
+int main(int argc, char* argv[])
+{
+    try
+    {
+        auto [result, parser] = create_args(argc, argv);
+        if(!result)
+            return EXIT_FAILURE;
+        run<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(parser);
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp
new file mode 100755
index 0000000000..579d2770db
--- /dev/null
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include "ck_tile/ops/gemm.hpp"
+
+#pragma once
+
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::fp8_t>
+{
+    static constexpr const char* name = "fp8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf8_t>
+{
+    static constexpr const char* name = "bf8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::pk_int4_t>
+{
+    static constexpr const char* name = "pk_int4_t";
+};
+
+/// @brief Defines the configuration parameters for a GEMM operation, enabling the selection of a
+/// specific kernel instance based on the provided settings.
+struct KernelTraits
+{
+    /// @brief The name of the pipeline.
+    std::string pipeline;
+    /// @brief The name of the scheduler (e.g., "intrawave", "interwave").
+    std::string scheduler;
+    /// @brief The name of the epilogue (e.g., "cshuffle", "default").
+    std::string epilogue;
+    /// @brief Indicates whether padding is applied to the M dimension.
+    bool kPadM;
+    /// @brief Indicates whether padding is applied to the N dimension.
+    bool kPadN;
+    /// @brief Indicates whether padding is applied to the K dimension.
+    bool kPadK;
+};
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("k", "2048", "k dimension")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("split_k", "1", "splitK value")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("structured_sparsity", "0", "0:false, 1:true")
+        .insert("pipeline", "compv3", "compv3, compv4, mem")
+        .insert("scheduler", "intrawave", "intrawave, interwave")
+        .insert("epilogue", "cshuffle", "cshuffle, default")
+        .insert("pad_m", "false", "true, false")
+        .insert("pad_n", "false", "true, false")
+        .insert("pad_k", "false", "true, false");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename Tensor>
+void permute_vectors_i4x4_b(Tensor& tensor)
+{
+    const ck_tile::index_t K = tensor.get_length(0);
+    const ck_tile::index_t N = tensor.get_length(1);
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int8_t input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int8_t i4x2      = tensor(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int8_t hi   = input[2];
+                int8_t lo   = input[0];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 0, i) = i4x2;
+            }
+
+            {
+                int8_t hi   = input[6];
+                int8_t lo   = input[4];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 2, i) = i4x2;
+            }
+
+            {
+                int8_t hi   = input[3];
+                int8_t lo   = input[1];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 4, i) = i4x2;
+            }
+
+            {
+                int8_t hi   = input[7];
+                int8_t lo   = input[5];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 6, i) = i4x2;
+            }
+        }
+    }
+}
+
+/// @brief Function to compare the results of the device and host computations
+void compare(ck_tile::index_t K,
+             ck_tile::index_t kbatch,
+             ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+             ck_tile::HostTensor<CDataType>& c_m_n_host_result)
+{
+    const float max_accumulated_value =
+        *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end());
+    const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+        K, kbatch, max_accumulated_value);
+    bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                   c_m_n_host_result,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+    std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+              << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl;
+}
+
+/// @brief Function to get the kernel output with reference implementation on CPU/GPU
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+void gemm_host_reference(int verify,
+                         ck_tile::HostTensor<ADataType>& a_m_k,
+                         ck_tile::HostTensor<BDataType>& b_k_n,
+                         ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                         ck_tile::DeviceMem& a_m_k_dev_buf,
+                         ck_tile::DeviceMem& b_k_n_dev_buf,
+                         ck_tile::index_t M,
+                         ck_tile::index_t N,
+                         ck_tile::index_t K,
+                         ck_tile::index_t stride_A,
+                         ck_tile::index_t stride_B,
+                         ck_tile::index_t stride_C)
+{
+    if(verify == 1)
+    {
+        c_m_n_host_result.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_result);
+    }
+    else if(verify == 2)
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Restore input for B for gpu reference
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes());
+        c_m_n_host_result.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data());
+    }
+}
diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py
new file mode 100755
index 0000000000..b441bdd2d6
--- /dev/null
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -0,0 +1,644 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import argparse
+from enum import IntEnum
+from pathlib import Path
+import sys
+from typing import List, Optional, Dict, Any
+import functools
+import itertools
+import copy
+import json
+from dataclasses import dataclass
+ 
+DATA_TYPE_MAP = {'fp32'  : 'float',
+                 'fp16'  : 'ck_tile::half_t',
+                 'bf16'  : 'ck_tile::bf16_t',
+                 'int8'  : 'ck_tile::int8_t',
+                 'fp8'   : 'ck_tile::fp8_t',
+                 'bf8'   : 'ck_tile::bf8_t',
+                 'int4'  : 'ck_tile::pk_int4_t'
+                }
+
+LAYOUT_MAP = {'r' : 'ck_tile::tensor_layout::gemm::RowMajor',
+              'c' : 'ck_tile::tensor_layout::gemm::ColumnMajor'}                                       
+
+DEFAULT_EPILOGUE = """
+            using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue<
+                                ck_tile::DefaultGemm2DEpilogueProblem<ADataType,
+                                                                      BDataType,
+                                                                      AccDataType, 
+                                                                      CDataType, 
+                                                                      CLayout, 
+                                                                      kPadM,
+                                                                      kPadN,
+                                                                      WarpTileM,
+                                                                      WarpTileN,
+                                                                      WarpTileK,
+                                                                      UniversalGemmProblem::TransposeC>>;
+"""
+
+CSHUFFLE_EPILOGUE = """
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                             BDataType,
+                                                             AccDataType,
+                                                             CDataType,
+                                                             CLayout,
+                                                             GemmPipelineProblem::kBlockSize,
+                                                             TilePartitioner::MPerBlock,
+                                                             TilePartitioner::NPerBlock,
+                                                             WarpM,
+                                                             WarpN,
+                                                             WarpTileM,
+                                                             WarpTileN,
+                                                             WarpTileK,
+                                                             UniversalGemmProblem::TransposeC>>;
+"""
+HOT_LOOP_FALSE = """
+            if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<false>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+            else if(tail_num == ck_tile::TailNumber::Odd)
+            {
+                Run(ck_tile::bool_constant<false>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+            }
+            else if(tail_num == ck_tile::TailNumber::Even)
+            {
+                Run(ck_tile::bool_constant<false>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+            }
+            else
+            {
+                throw std::runtime_error("Num K loop must be larger than number of prefetech stages.");
+            }  
+"""
+RUN_MEM = """
+            if(tail_num == ck_tile::TailNumber::One)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
+            }
+            else if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+
+            if constexpr(BaseGemmPipeline::PrefetchStages > 2)
+            {
+                if(tail_num == ck_tile::TailNumber::Two)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+                }
+        
+                if(tail_num == ck_tile::TailNumber::Three)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+                }
+                if(tail_num == ck_tile::TailNumber::Four)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
+                }
+                if(tail_num == ck_tile::TailNumber::Five)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
+                }
+                if(tail_num == ck_tile::TailNumber::Six)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
+                }
+                if(tail_num == ck_tile::TailNumber::Seven)
+                {
+                    Run(ck_tile::bool_constant<true>{},
+                        ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
+                }
+                throw std::runtime_error("The tile number is wrong! It should not exceed the prefetch stage numbers");
+            }
+"""
+
+RUN_COMPV3 = """
+            if(tail_num == ck_tile::TailNumber::Full)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            }
+            else if(tail_num == ck_tile::TailNumber::Odd)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
+            }
+            else if(tail_num == ck_tile::TailNumber::Even)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+            }
+            else
+            {
+                throw std::runtime_error("The tail number is wrong. It should be Full, Odd, or Even.");
+            }
+"""
+
+RUN_COMPV4 = """
+            if(tail_num == ck_tile::TailNumber::Three)
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+            }
+            else
+            {
+                Run(ck_tile::bool_constant<true>{},
+                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+            }
+"""
+
+
+PIPELINE_MAP = {'mem' : ['ck_tile::BaseGemmPipelineAgBgCrMem', 'ck_tile::GemmPipelineAgBgCrMem'],
+                'compv3' : ['ck_tile::BaseGemmPipelineAgBgCrCompV3', 'ck_tile::GemmPipelineAgBgCrCompV3'],
+                'compv4' : ['ck_tile::BaseGemmPipelineAgBgCrCompV4', 'ck_tile::GemmPipelineAgBgCrCompV4']}
+
+SCHEDULER_MAP = {'interwave' : 'ck_tile::GemmPipelineScheduler::Interwave',
+                 'intrawave' : 'ck_tile::GemmPipelineScheduler::Intrawave'}
+
+EPILOGUE_MAP = {'default' :DEFAULT_EPILOGUE,
+                'cshuffle' : CSHUFFLE_EPILOGUE}      
+
+HOT_LOOP_TRUE = {'mem' : RUN_MEM,
+                 'compv3' : RUN_COMPV3,
+                 'compv4' : RUN_COMPV4}    
+
+
+def BOOL_MAP(b_) -> str:
+    if b_:
+        return 'true'
+    else:
+        return 'false'
+
+@dataclass
+class GemmConfig:
+    def __init__(self, config_data):
+        self.matrix_cfg : Dict[str, Any] = {}
+        self.impl_cfg : Dict[str, Any] = {}
+        for key, value in config_data.items():
+            if key in ["datatype", "layout_a", "layout_b", "layout_c"]:
+                self.matrix_cfg[key] = value
+            else:
+                self.impl_cfg[key] = value
+    
+    @property
+    def datatype(self) -> str:
+        return self.matrix_cfg["datatype"]["values"][0]
+    
+    @property
+    def layouts(self) -> List[str]:
+        return [
+            self.matrix_cfg["layout_a"]["values"][0],
+            self.matrix_cfg["layout_b"]["values"][0],
+            self.matrix_cfg["layout_c"]["values"][0]
+        ]
+
+
+class GemmCodeGenerator:
+    def __init__(self, output_dir: str, config: GemmConfig):
+        self.output_dir = Path(output_dir)
+        if not self.output_dir.exists():
+            self.output_dir.mkdir()
+
+        self.config = config
+        self.all_kernels = []
+        self.unique_configs = [] 
+        # Validate configurations
+        self._validate_config()
+
+    def _validate_config(self):
+        """Validate matrix and implementation configurations"""
+        # Matrix config validation
+        for param in ["datatype", "layout_a", "layout_b", "layout_c"]:
+            if len(self.config.matrix_cfg[param]["values"]) != 1:
+                raise ValueError(f"Matrix config {param} must have exactly one value")
+        
+        # Implementation traits validation
+        required_params = ["tile_m", "tile_n", "tile_k", "warp_m", "warp_n", "warp_k",
+                          "warp_tile_m", "warp_tile_n", "warp_tile_k", "pipeline",
+                          "epilogue", "scheduler", "kPadM", "kPadN", "kPadK"]
+        for param in required_params:
+            if not self.config.impl_cfg.get(param, {}).get("values"):
+                raise ValueError(f"Missing implementation parameter: {param}")
+
+    def list_all(self):
+        """List all possible kernel configurations"""
+        w_p = Path(self.output_dir)
+        list_p = w_p / 'gemm_instance_blobs.txt'
+        self._list_config_groups()
+        with list_p.open('w') as list_f:
+            list_f.write(str(w_p / ("gemm_common.hpp"))  + "\n")
+            list_f.write(str(w_p / ("gemm_instances.hpp"))  + "\n")
+            list_f.write(str(w_p / ("gemm_dispatcher.hpp"))  + "\n")  
+            for group in self.all_kernels:
+                list_f.write(str(w_p / ("gemm_" + group + ".hpp")) + "\n")
+            
+
+
+    def _list_config_groups(self):
+        params = [
+            ("pipeline", "pipeline"),
+            ("epilogue", "epilogue"),
+            ("scheduler", "scheduler"),
+            ("kPadM", "kPadM"),
+            ("kPadN", "kPadN"), 
+            ("kPadK", "kPadK")
+        ]
+        
+        # Generate all unique_combinations
+        _unique = set(itertools.product(*[self.config.impl_cfg[p]["values"] for (p, _) in params]))
+        for combo in _unique:
+            config = {name: value for (_, name), value in zip(params, combo)}
+            pipeline, epilogue, scheduler, kPadM, kPadN, kPadK = config.values()
+            # To remove some unsupported combinations
+            unsupported_combination = [("compv3", "cshuffle", "interwave"),
+                                       ("compv3", "default", "interwave"),
+                                       ("compv4", "cshuffle", "interwave"),
+                                       ("compv4", "default", "interwave")]
+            if (pipeline, epilogue, scheduler) not in unsupported_combination:
+                group_name = f"{pipeline}_{epilogue}_{scheduler}_pad_{BOOL_MAP(kPadM)}_{BOOL_MAP(kPadN)}_{BOOL_MAP(kPadK)}"
+                self.all_kernels.append(group_name)
+                self.unique_configs.append(config)
+
+    def generate_all(self):
+        self._generate_common_header()
+        self._generate_config_groups()
+        self._generate_dispatcher()
+       
+
+    def _generate_common_header(self):
+        """Generate common header with datatypes and layout"""
+        ctype = self.config.datatype
+        atype = self.config.datatype
+        btype = self.config.datatype
+        if self.config.datatype in ['fp8', 'bf8']:
+            ctype = 'fp16'
+        elif self.config.datatype in ['int4']:
+            atype = 'fp16'
+            ctype = 'fp16'
+
+        content = f"""// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "ck_tile/core.hpp"
+
+// Data types
+using ADataType = {DATA_TYPE_MAP[atype]};
+using BDataType = {DATA_TYPE_MAP[btype]};
+using AccDataType = float;
+using CDataType = {DATA_TYPE_MAP[ctype]};
+
+// Layout configurations
+using ALayout = {LAYOUT_MAP[self.config.layouts[0]]};
+using BLayout = {LAYOUT_MAP[self.config.layouts[1]]};
+using CLayout = {LAYOUT_MAP[self.config.layouts[2]]};
+"""
+        
+
+        (self.output_dir / "gemm_common.hpp").write_text(content)
+
+    def _generate_config_groups(self):
+        """Generate implementation configuration groups"""
+        if not self.unique_configs:  # Check if the list is empty
+            self._list_config_groups()
+        for config in self.unique_configs:
+            self._generate_config_group(**config)
+        self.generate_common_instances_header()
+
+    
+    def _generate_config_group(self, pipeline: str, epilogue: str, scheduler: str,
+                              kPadM: bool, kPadN: bool, kPadK: bool):
+        """Generate a configuration group with all tile/warp combinations"""
+        group_name = f"{pipeline}_{epilogue}_{scheduler}_pad_{BOOL_MAP(kPadM)}_{BOOL_MAP(kPadN)}_{BOOL_MAP(kPadK)}"
+        filename = f"gemm_{group_name}.hpp"
+
+        content = f"""// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_common.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/host.hpp"
+
+namespace {group_name} {{
+"""
+        # Add template struct with configuration
+        content += self._generate_kernel_struct(pipeline, epilogue, scheduler, kPadM, kPadN, kPadK)
+
+        content += f"\n}} // namespace {group_name}\n"
+        (self.output_dir / filename).write_text(content)
+
+    def _generate_kernel_struct(self, pipeline: str, epilogue: str, scheduler: str,
+                               kPadM: bool, kPadN: bool, kPadK: bool) -> str:
+        """Generate kernel struct template"""
+        return f"""
+template <int TileM, int TileN, int TileK,
+          int WarpM, int WarpN, int WarpK,
+          int WarpTileM, int WarpTileN, int WarpTileK,
+          bool structured_sparsity>
+struct GemmKernel {{
+    static constexpr bool kPadM = {BOOL_MAP(kPadM)};
+    static constexpr bool kPadN = {BOOL_MAP(kPadN)};
+    static constexpr bool kPadK = {BOOL_MAP(kPadK)};
+
+    static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{
+        static constexpr bool permuteA = false;
+        static constexpr bool permuteB = false;
+        static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"};
+        static constexpr bool TransposeC = false;
+
+        static constexpr int kBlockPerCu                         = 1;
+        static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+        static constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+        using GemmShape = 
+            ck_tile::TileGemmShape<ck_tile::sequence<TileM, TileN, TileK>,
+                                   ck_tile::sequence<WarpM, WarpN, WarpK>,
+                                   ck_tile::sequence<WarpTileM, WarpTileN, WarpTileK>,
+                                   permuteA,
+                                   permuteB>;
+
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                      TileParitionerGroupNum,
+                                                      TileParitionerM01>;
+
+        using Traits  =
+            ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;        
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<kPadM, kPadN, kPadK, DoubleSmemBuffer,
+                                             ALayout, BLayout, CLayout, TransposeC, structured_sparsity>;    
+
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = {PIPELINE_MAP[pipeline][0]}<GemmPipelineProblem>;  
+
+        const ck_tile::index_t k_grain     = args.k_batch * TileK;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * TileK;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);                                                                                                             
+
+        float ave_time{{0}};
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {{
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr auto scheduler      = {SCHEDULER_MAP[scheduler]};
+
+            using UniversalGemmProblem = 
+                ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      GemmShape,
+                                                      GemmUniversalTraits,
+                                                      scheduler,
+                                                      has_hot_loop_v,
+                                                      tail_number_v>;
+
+            using GemmPipeline = {PIPELINE_MAP[pipeline][1]}<UniversalGemmProblem>; 
+            {EPILOGUE_MAP[epilogue]}
+            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+            constexpr dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {{
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
+            }}
+
+            if(s.log_level_ > 0)
+            {{
+                std::cout << "Launching kernel with args:"
+                      << " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}"
+                      << ", blocks: {{" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}}"
+                      << std::endl;
+            }}
+
+            ave_time = ck_tile::launch_kernel(s,
+                                          ck_tile::make_kernel<blocks.x, kBlockPerCu>(
+                                              Kernel{{}}, grids, blocks, 0, kargs));
+            return ave_time;
+
+        }};
+
+        if(has_hot_loop) {{
+            {HOT_LOOP_TRUE[pipeline]}
+        }} else {{
+            {HOT_LOOP_FALSE}
+        }}
+
+        return ave_time;
+    }}
+    static std::string get_name() {{
+        return std::string("GemmKernel<Bllktile: ") + std::to_string(TileM) + "x" + std::to_string(TileN) + "x" + std::to_string(TileK) + ", " +
+                "WaveMap: " + std::to_string(WarpM) + "x" + std::to_string(WarpN) + "x" + std::to_string(WarpK) + ", " +
+                "WarpTile: " + std::to_string(WarpTileM) + "x" + std::to_string(WarpTileN) + "x" + std::to_string(WarpTileK) + ", " +
+                "PadidngM: " + "{kPadM}" + ", " +
+                "PaddingN: " + "{kPadN}" + ", " +
+                "PaddingK: " + "{kPadK}" + ", " +
+                "Pipeline: " + "{pipeline}" + ", " +
+                "Epilogue: " + "{epilogue}" + ", " +
+                "Scheduler: " + "{scheduler}";
+                }}
+}};
+"""
+
+    def generate_common_instances_header(self):
+        """Generate common instances header"""
+        content = """// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+"""
+        for group in self.all_kernels:
+            content += f"#include \"gemm_{group}.hpp\"\n"
+        (self.output_dir / "gemm_instances.hpp").write_text(content)
+
+    def _generate_dispatcher(self):
+        """Generate dispatch mechanism"""
+        content = """// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_common.hpp"
+#include "gemm_instances.hpp"
+#include "gemm_host_api.hpp"
+#include <unordered_map>
+#include <functional>
+#include <vector>
+
+struct GemmDispatcher {
+    static auto& get_kernel_map() {
+        // Use a static local variable
+        static std::unordered_map<std::string, 
+            std::function<void(ck_tile::DeviceMem& c_m_n_dev_buf,
+                               ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                               ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+                               int verify, ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>> kernel_map;
+        return kernel_map;
+    }
+
+    static void init(bool structured_sparsity) {
+        auto& kernel_map = get_kernel_map();    
+        if(!kernel_map.empty()) return;
+        \n"""
+         # Add tile/warp instantiations
+        tile_params = set(itertools.product(
+            self.config.impl_cfg["tile_m"]["values"],
+            self.config.impl_cfg["tile_n"]["values"],
+            self.config.impl_cfg["tile_k"]["values"],
+            self.config.impl_cfg["warp_m"]["values"],
+            self.config.impl_cfg["warp_n"]["values"],
+            self.config.impl_cfg["warp_k"]["values"],
+            self.config.impl_cfg["warp_tile_m"]["values"],
+            self.config.impl_cfg["warp_tile_n"]["values"],
+            self.config.impl_cfg["warp_tile_k"]["values"]
+        ))
+
+        
+        for group in self.all_kernels:
+            content += f"""            kernel_map["{group}"] = [=](ck_tile::DeviceMem& c_m_n_dev_buf,
+                                                                  ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                                                                  ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+                                                                  int verify, ck_tile::GemmHostArgs& args,
+                                                                  const ck_tile::stream_config& stream) {{
+                        """
+            for tile in tile_params:
+                # Check if we have valid tile/warp combinations 
+                # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m
+                if ((tile[0]/(tile[3] * tile[7]) * tile[3] * tile[7]) != tile[0]) or \
+                   ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]):
+                    continue
+                content += f"""
+                if(structured_sparsity) {{
+                    run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {1}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);
+                }} else {{
+                    run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {0}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);
+                }}"""
+            content += f"""
+            }};\n"""
+
+        content += """    }
+    
+    template <typename Kernel>
+    static void run_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
+                           ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                           ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+                           int verify, ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream)
+    {
+        float avg_time = Kernel::launch(args, stream);
+        std::string description = Kernel::get_name();
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+        
+        std::size_t flop = std::size_t(2) * args.M * args.N * args.K;
+        std::size_t num_byte = sizeof(ADataType) * args.M * args.K + sizeof(BDataType) * args.N * args.K + sizeof(CDataType) * args.M * args.N;
+        float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+        float gb_per_sec = num_byte / 1.E6 / avg_time;
+
+        std::cout << "Performance for " << description << " : " << avg_time << " ms, "
+                << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+
+        if(verify)
+            compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result);
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
+    }
+
+    static auto dispatch(ck_tile::DeviceMem& c_m_n_dev_buf,
+                         ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                         ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+                         int verify, bool structured_sparsity, const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args,
+                         const ck_tile::stream_config& stream) {
+        init(structured_sparsity);
+        const std::string key = assemble_key(trait);
+        auto& kernel_map = get_kernel_map(); 
+        if(auto it = kernel_map.find(key); it != kernel_map.end()) {
+            return it->second(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, gemm_args, stream); 
+        }
+        throw std::runtime_error("No suitable kernel found: " + key);
+    }
+
+private:
+    static std::string assemble_key(const KernelTraits &trait) {
+        return std::string(trait.pipeline) + "_" + 
+               trait.epilogue + "_" + 
+               trait.scheduler + "_" +
+               "pad_" + 
+               (trait.kPadM ? "true" : "false") + "_" +
+               (trait.kPadN ? "true" : "false") + "_" +
+               (trait.kPadK ? "true" : "false");
+    }
+};
+
+"""
+        (self.output_dir / "gemm_dispatcher.hpp").write_text(content)
+
+        
+def do_list_blobs(args, gemm_config):
+    generator = GemmCodeGenerator(args.working_path, gemm_config)
+    generator.list_all()
+
+def do_gen_blobs(args, gemm_config):
+    generator = GemmCodeGenerator(args.working_path, gemm_config)
+    generator.generate_all()
+
+     
+
+def main(args):
+    # Read json file
+    with open(args.json, 'r') as json_file:
+        config_data = json.load(json_file)
+    
+    gemm_config = GemmConfig(config_data)
+
+    if args.list_blobs:
+        do_list_blobs(args, gemm_config)
+    elif args.gen_blobs:
+        do_gen_blobs(args, gemm_config)
+    else:
+        # If neither was specified, either do nothing or default to gen_blobs
+        print("No mode specified (use --list_blobs or --gen_blobs). Generating by default...")
+        do_gen_blobs(args, gemm_config)
+   
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK gemm kernel",
+    )
+    parser.add_argument(
+        "-w", "--working_path", default="./", required=False, help="the path where all the blobs are going to be generated"
+    )
+    parser.add_argument(
+        "-j", "--json", required=True, help="Path to the json which contains the kernel configurations"
+    )
+    parser.add_argument(
+        "-l", "--list_blobs", action = 'store_true', help="List all kernel to file"
+    )
+    parser.add_argument(
+        "-g", "--gen_blobs", action = 'store_true', help="Generate all kernels into different files"
+    )
+    
+    args = parser.parse_args()
+    
+    main(args)